2021 ONE-Seq Focused Analysis Report

library(tidyverse)
library(here)
library(plotly)
library(scales)
library(tidyr)
library(kableExtra)
library(tibble)

PROJECT INFORMATION & METADATA

For more detailed project information please see accompanying report; “Metadata_Report.Rmd”

Information contained:

  • Project goals & summary
  • Sequencing metadata
  • File metadata
  • ONE-Seq pipeline run metadata

For additional, less-structured analyses please see accompanying report: “Analysis_Report.Rmd”.

Sample Overview

cat(readLines(here("data/ONE-Seq_Run_Data/README.md")), sep = '\n')
## The primary output files (scores.tsv and QC.csv) from ONEseq are copied here and renamed according to their naming schema.
## 
## *XXX00m*: XXX = wet-lab prep location, sequencing location, bioinformatics location, dataset number, *m*anually prepared
## 
## *N* = NIST *J* = Joung Lab
## 
## | Name   | Date    | Description                                                                                                 |
## |--------|---------|-------------------------------------------------------------------------------------------------------------|
## | JJJ01m | 2021-05 | UMI length of 8, SeQure data for target sites, run by Martin                                                |
## | JJN01m | NA      | UMI length of 8, SeQure data for target sites, run by Sierra                                                |
## | JJN02m | NA      | Replicate 1 of SeQure's HBB data, UMI = 8                                                                   |
## | JJN03m | NA      | Replicate 2 of SeQure's HBB data, UMI = 8                                                                   |
## | NNN01m | 2021-05 | Natasha's prep of samples with UMI=11 but run with UMI set to 8 - 'test experiments'                        |
## | NNN02m | 2021-05 | Same wet-lab samples as NNN01m but changed UMI param from 8 to 11 'replicate experiments - R1'              |
## | NNN03m | 2021-07 | Replicate 1 of Triplicate data for our 3 target sites generated July 2021 'replicate experiments - R1'      |
## | NNN04m | 2021-07 | Replicate 2 of Triplicate data for our 3 target sites generated July 2021 'replicate experiments - R1'      |
## | NNN05m | 2021-07 | Replicate 3 of Triplicate data for our 3 target sites generated July 2021 'replicate experiments - R1'      |
## | NNN06m | 2021-07 | Replicate 1 of Triplicate data for our 3 target sites generated September 2021 'replicate experiments - R2' |
## | NNN07m | 2021-07 | Replicate 2 of Triplicate data for our 3 target sites generated September 2021 'replicate experiments - R2' |
## | NNN08m | 2021-07 | Replicate 3 of Triplicate data for our 3 target sites generated September 2021 'replicate experiments - R2' |

LOAD DATA

Load & mung data

TARGET SITES: EMX1, FANCF, RNF2

########################################################################
# MAIN OUTPUT 
########################################################################
## get list of files in results directory
scores_file_list <- as.list(list.files(path = here("data/report_input"), 
                           pattern = "scores.tsv",
                           include.dirs = TRUE, full.names = TRUE, 
                           recursive = TRUE))
  
## strip the path portion of the file names
scores_file_names <- str_remove(scores_file_list, paste0(here("data/report_input"),"/"))

## organize the file list to read them into a df
scores_df_lst <- set_names(scores_file_list, scores_file_names)

## make lists into a data frame
scores_metadata_df <- tibble(scores_file = unlist(scores_file_names)) %>% 
  separate(scores_file, c("lab","target_site"), 
           sep = "_", remove = FALSE) %>%
  mutate(target_site = str_remove(target_site, ".scores.tsv"))

## read in files & annotate with the metadata
scores_df <- scores_df_lst %>%
  map_dfr(read_tsv, .id = "scores_file") %>% 
  ## if normalized proto AND normalized pam = 0 remove because 
  ## no signal to support that barcode/off-target
  filter(normalized_counts_pam + normalized_counts_proto > 0) %>%
  left_join(scores_metadata_df) 

########################################################################
# SUMMARY OUTPUT
########################################################################
## get list of files in results directory
qc_file_list <- as.list(list.files(path = here("data/report_input"), 
                           pattern = "QC.csv",
                           include.dirs = TRUE, full.names = TRUE, 
                           recursive = TRUE))

## strip the path portion of the file names
qc_file_names <- str_remove(qc_file_list, paste0(here("data/report_input"),"/"))

## organize the file list to read them into a df
qc_df_lst <- set_names(qc_file_list, qc_file_names)

## make lists into a data frame
qc_metadata_df <- tibble(qc_file = unlist(qc_file_names)) %>% 
  separate(qc_file, c("lab","target_site"), 
           sep = "_", remove = FALSE) %>%
  mutate(target_site = str_remove(target_site, ".QC.csv"))

## read in files & annotate with the metadata
qc_df <- qc_df_lst %>%
  map_df(read_csv, col_names = c("var","number_of_seqs"), .id = "sample") %>%
  mutate(var= str_remove(var, "number_of_")) %>% 
  mutate(proto_pam = str_extract(var, "pam")) %>% ## detect pam string and place pam in col
  mutate(proto_pam = replace_na(proto_pam, "proto")) %>% ## all NAs in this column will be "proto"
  mutate(var= str_remove(var, "_pam")) %>% ## remove strings to just the variables
  mutate(var= str_remove(var, "_proto")) %>%
  pivot_wider(names_from = proto_pam, values_from = number_of_seqs, values_fill = NA) %>%
  separate(sample,c("lab","target_site"), sep = "_", remove = FALSE) %>%
  select(-sample) %>%
  mutate(target_site = str_remove(target_site, ".QC.csv"))


########################################################################
# VIKRAM'S QC OUTPUT
########################################################################
################## QC SCORES FILE   
## get list of files in results directory
vikramqc_file_list <- as.list(list.files(path = here("data/Vikram_QC/report_input"), 
                           pattern = "QC.csv",
                           include.dirs = TRUE, full.names = TRUE, 
                           recursive = TRUE))
  
## strip the path portion of the file names
vikramqc_file_names <- str_remove(vikramqc_file_list, paste0(here("data/Vikram_QC/report_input"),"/"))

## organize the file list to read them into a df
vikramqc_df_lst <- set_names(vikramqc_file_list, vikramqc_file_names)

## make lists into a data frame
vikramqc_metadata_df <- tibble(vikramqc_file = unlist(vikramqc_file_names)) %>% 
  separate(vikramqc_file, c("lab","target_site"), 
           sep = "_", remove = FALSE) %>%
  mutate(target_site = str_remove(target_site, "QC.csv"))

## read in files & annotate with the metadata
vikramqc_df <- vikramqc_df_lst %>%
  map_dfr(read_csv, .id = "vikramqc_file") %>% 
  left_join(vikramqc_metadata_df) 
 
################## QC SUMMARY FILE   
## get list of files in results directory
vikramqc_summ_file_list <- as.list(list.files(path = here("data/Vikram_QC/report_input"), 
                           pattern = "QC.tsv",
                           include.dirs = TRUE, full.names = TRUE, 
                           recursive = TRUE))

## strip the path portion of the file names
vikramqc_summ_file_names <- str_remove(vikramqc_summ_file_list, paste0(here("data/Vikram_QC/report_input"),"/"))

## organize the file list to read them into a df
vikramqc_summ_df_lst <- set_names(vikramqc_summ_file_list, vikramqc_summ_file_names)

## make lists into a data frame
vikramqc_summ_metadata_df <- tibble(vikramqc_summ_file = unlist(vikramqc_summ_file_names)) %>% 
  separate(vikramqc_summ_file, c("lab","target_site"), 
           sep = "_", remove = FALSE) %>%
  mutate(target_site = str_remove(target_site, ".QC.tsv"))

## read in files & annotate with the metadata
vikramqc_summ_df <- vikramqc_summ_df_lst %>%
  map_df(read_tsv, .id = "sample") %>%
  separate(sample,c("lab","target_site"), sep = "_", remove = TRUE) 

########################################################################
# CALCULATING RAW READ COUNTS
########################################################################
# EQUATION FOR RAW READS 
## on target reads = 1/smallest non-zero normalized read count
## per row raw read count = on target read * that row's normalized read count 
scores_df <- scores_df %>%
  filter(!lab == "NNN01m") %>%
  group_by(lab, target_site) %>%
  ## note the smalled normalized counts for pam/proto to be used in calculation
  mutate(min_pam_norm_count = min(normalized_counts_pam[normalized_counts_pam>0]),
         min_proto_norm_count = min(normalized_counts_proto[normalized_counts_proto>0])) %>%
  mutate(pam_on_target_reads = 1/min_pam_norm_count, 
         proto_on_target_reads = 1/min_proto_norm_count) %>%
  rowwise() %>%
  mutate(pam_raw_read_count = normalized_counts_pam * pam_on_target_reads, 
         proto_raw_read_count = normalized_counts_proto * proto_on_target_reads) %>%
  select(-min_pam_norm_count, -min_proto_norm_count) 


########################################################################
# COLOR PALETTE 
########################################################################
## establish color palette for plots
cbPalette <- c("#000000", 
               "#E69F00", 
               "#56B4E9", 
               "#009E73", 
               "#F0E442", 
               "#0072B2", 
               "#D55E00", 
               "#CC79A7", 
               "#999999", 
               "#20F03B",
               "#FEC44F", 
               "#D95F0E", 
               "#756BB1", 
               "#FF8A33", 
               "#D7BE07", 
               "#A807D7")

########################################################################
# REMOVE UNNECESSARY OBJECTS FROM ENVIRONMENT 
########################################################################
rm(qc_df_lst)
rm(qc_file_list)
rm(scores_df_lst)
rm(scores_file_list)
rm(vikramqc_df_lst)
rm(vikramqc_file_list)
rm(vikramqc_summ_df_lst)
rm(vikramqc_summ_file_list)

R1 REPLICATE COMPARISONS

Create a dataframe with R1 data only to use in this section. (NNN03m, NNN04m, NNN05m)

########################################################################
# DEFINE LABS
########################################################################
## create R1 df: NNN02, NNN03, NNN04
labs <- c("NNN03m", "NNN04m", "NNN05m")

########################################################################
# CREATE R1 SCORES DF
########################################################################
r1_scores_df <- scores_df %>%
  filter(lab %in% labs)

########################################################################
# CREATE R1 QC DF
########################################################################
## qc df
r1_qc_df <- qc_df %>%
  filter(lab %in% labs)

TOTAL NUMBER OF BARCODES

The total number of barcodes with at least 1 read for each triplicate & target site.

########################################################################
# CREATE MINIMAL DF THAT DESCRIBES TOTAL BARCODES
########################################################################
total_barcodes_plot_df <- r1_scores_df %>%
  group_by(target_site, lab) %>%
  count(target_site, lab,  name = "total_barcodes") %>%
  arrange(target_site, lab)

########################################################################
# PLOT
########################################################################
ggplotly(ggplot(total_barcodes_plot_df) + 
    geom_bar(aes(x = target_site, y = total_barcodes, fill = lab), 
    # dodge prserves vertical position, single groups bars of same metric together
    # stat = identity skips aggregation of values - I provide values
             position = position_dodge(preserve = "single"), stat = "identity") + 
    scale_y_continuous(labels = comma) + # add commas to axis label numbers
    theme_bw() + 
    theme(axis.text.x = element_text(angle = 90)) + 
    labs(x = "Target Site", y = "Total Number of Barcodes", 
         fill = "Lab", main = "Total Number of Barcodes") +
      scale_fill_manual(values=cbPalette))
########################################################################
# TABLE
########################################################################
(total_barcodes_plot_df %>%
    pivot_wider(names_from = "lab",
              values_from = "total_barcodes", 
              values_fill = 0) %>%
  rowwise() %>%
  mutate(AVG = mean(c(NNN03m, NNN04m, NNN05m)),
         SD = sd(c(NNN03m, NNN04m, NNN05m)),
         CV = SD / AVG) %>%
  arrange(desc(CV)) %>%
  kbl() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", html_font = "Cambria")))
target_site NNN03m NNN04m NNN05m AVG SD CV
FANCF 17127 10916 9347 12463.33 4114.338 0.3301154
RNF2 14479 11270 14453 13400.67 1845.257 0.1376989
EMX1 23447 27021 25265 25244.33 1787.090 0.0707917
########################################################################
# REMOVE ENV VARS 
########################################################################
rm(total_barcodes_plot_df)

QC READ COUNTS

From the ONE-Seq QC_scores.csv file: The number of reads (proto & pam) in each QC category, plotted for each target site.

####################################################################
## FUNCTION TO MAKE PLOTS
####################################################################
make_qc_reads_plot <- function(df, targetsite){
  plot_df <- df %>%
    filter(target_site == {{targetsite}}) %>% 
    pivot_longer(names_to = "proto_pam", 
                 values_to = "read_count", 
                 cols = c("proto","pam")) %>%
    group_by(target_site)
  
  ## plot 
  ggplotly(ggplot(plot_df) + 
    geom_bar(aes(x = var, y = read_count, fill = lab), 
             position = position_dodge(preserve = "single"), stat = "identity") +
    scale_y_continuous(labels = comma) +
    theme_bw() + 
    theme(axis.text.x = element_text(angle = 90)) + 
    facet_wrap(~proto_pam) + 
    labs(x = "Metric", y = "Read Count", 
         fill = "Lab", title = {{targetsite}}) + scale_fill_manual(values=cbPalette))
}  

####################################################################
## FUNCTION TO MAKE TABLE
####################################################################
make_qc_table <- function(df, targetsite){
  labs <- df$labs
  (plot_df <- df %>%
    filter(target_site == {{targetsite}}) %>% 
    pivot_longer(names_to = "proto_pam", values_to = "read_count", cols = c("proto","pam")) %>%
    group_by(target_site) %>%
    kbl() %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", html_font = "Cambria")))
}

EMX1

make_qc_reads_plot(r1_qc_df, "EMX1")
make_qc_table(r1_qc_df, "EMX1")
lab target_site var proto_pam read_count
NNN03m EMX1 seqs proto 489072
NNN03m EMX1 seqs pam 437017
NNN03m EMX1 seqs_with_primer proto 466871
NNN03m EMX1 seqs_with_primer pam 418686
NNN03m EMX1 seqs_with_umi proto 386126
NNN03m EMX1 seqs_with_umi pam 392840
NNN03m EMX1 seqs_with_libbarcode proto 343483
NNN03m EMX1 seqs_with_libbarcode pam 286664
NNN03m EMX1 seqs_with_target proto 420164
NNN03m EMX1 seqs_with_target pam 314442
NNN03m EMX1 seqs_passing_all_filters proto 324280
NNN03m EMX1 seqs_passing_all_filters pam 271902
NNN04m EMX1 seqs proto 770000
NNN04m EMX1 seqs pam 453853
NNN04m EMX1 seqs_with_primer proto 737277
NNN04m EMX1 seqs_with_primer pam 434914
NNN04m EMX1 seqs_with_umi proto 605297
NNN04m EMX1 seqs_with_umi pam 407884
NNN04m EMX1 seqs_with_libbarcode proto 534600
NNN04m EMX1 seqs_with_libbarcode pam 300442
NNN04m EMX1 seqs_with_target proto 655693
NNN04m EMX1 seqs_with_target pam 329496
NNN04m EMX1 seqs_passing_all_filters proto 506708
NNN04m EMX1 seqs_passing_all_filters pam 284894
NNN05m EMX1 seqs proto 676250
NNN05m EMX1 seqs pam 490001
NNN05m EMX1 seqs_with_primer proto 644761
NNN05m EMX1 seqs_with_primer pam 469711
NNN05m EMX1 seqs_with_umi proto 528627
NNN05m EMX1 seqs_with_umi pam 439024
NNN05m EMX1 seqs_with_libbarcode proto 464578
NNN05m EMX1 seqs_with_libbarcode pam 312312
NNN05m EMX1 seqs_with_target proto 572272
NNN05m EMX1 seqs_with_target pam 343725
NNN05m EMX1 seqs_passing_all_filters proto 438149
NNN05m EMX1 seqs_passing_all_filters pam 296486

FANCF

make_qc_reads_plot(r1_qc_df, "FANCF")
make_qc_table(r1_qc_df, "FANCF")
lab target_site var proto_pam read_count
NNN03m FANCF seqs proto 560255
NNN03m FANCF seqs pam 781033
NNN03m FANCF seqs_with_primer proto 536202
NNN03m FANCF seqs_with_primer pam 747295
NNN03m FANCF seqs_with_umi proto 449942
NNN03m FANCF seqs_with_umi pam 702493
NNN03m FANCF seqs_with_libbarcode proto 404072
NNN03m FANCF seqs_with_libbarcode pam 531995
NNN03m FANCF seqs_with_target proto 489640
NNN03m FANCF seqs_with_target pam 579930
NNN03m FANCF seqs_passing_all_filters proto 383699
NNN03m FANCF seqs_passing_all_filters pam 503640
NNN04m FANCF seqs proto 269079
NNN04m FANCF seqs pam 419329
NNN04m FANCF seqs_with_primer proto 255912
NNN04m FANCF seqs_with_primer pam 401859
NNN04m FANCF seqs_with_umi proto 216768
NNN04m FANCF seqs_with_umi pam 378782
NNN04m FANCF seqs_with_libbarcode proto 196217
NNN04m FANCF seqs_with_libbarcode pam 292696
NNN04m FANCF seqs_with_target proto 237303
NNN04m FANCF seqs_with_target pam 318702
NNN04m FANCF seqs_passing_all_filters proto 184659
NNN04m FANCF seqs_passing_all_filters pam 277495
NNN05m FANCF seqs proto 241539
NNN05m FANCF seqs pam 372237
NNN05m FANCF seqs_with_primer proto 228055
NNN05m FANCF seqs_with_primer pam 355724
NNN05m FANCF seqs_with_umi proto 191388
NNN05m FANCF seqs_with_umi pam 335440
NNN05m FANCF seqs_with_libbarcode proto 172498
NNN05m FANCF seqs_with_libbarcode pam 254841
NNN05m FANCF seqs_with_target proto 210319
NNN05m FANCF seqs_with_target pam 277710
NNN05m FANCF seqs_passing_all_filters proto 160805
NNN05m FANCF seqs_passing_all_filters pam 240830

RNF2

make_qc_reads_plot(r1_qc_df, "RNF2")
make_qc_table(r1_qc_df, "RNF2")
lab target_site var proto_pam read_count
NNN03m RNF2 seqs proto 504328
NNN03m RNF2 seqs pam 297314
NNN03m RNF2 seqs_with_primer proto 481040
NNN03m RNF2 seqs_with_primer pam 284814
NNN03m RNF2 seqs_with_umi proto 396864
NNN03m RNF2 seqs_with_umi pam 266692
NNN03m RNF2 seqs_with_libbarcode proto 345287
NNN03m RNF2 seqs_with_libbarcode pam 202339
NNN03m RNF2 seqs_with_target proto 430652
NNN03m RNF2 seqs_with_target pam 222730
NNN03m RNF2 seqs_passing_all_filters proto 325605
NNN03m RNF2 seqs_passing_all_filters pam 191742
NNN04m RNF2 seqs proto 180327
NNN04m RNF2 seqs pam 222795
NNN04m RNF2 seqs_with_primer proto 170052
NNN04m RNF2 seqs_with_primer pam 213663
NNN04m RNF2 seqs_with_umi proto 141928
NNN04m RNF2 seqs_with_umi pam 199321
NNN04m RNF2 seqs_with_libbarcode proto 122152
NNN04m RNF2 seqs_with_libbarcode pam 151119
NNN04m RNF2 seqs_with_target proto 150423
NNN04m RNF2 seqs_with_target pam 166572
NNN04m RNF2 seqs_passing_all_filters proto 113197
NNN04m RNF2 seqs_passing_all_filters pam 143297
NNN05m RNF2 seqs proto 366433
NNN05m RNF2 seqs pam 298248
NNN05m RNF2 seqs_with_primer proto 350485
NNN05m RNF2 seqs_with_primer pam 285330
NNN05m RNF2 seqs_with_umi proto 288418
NNN05m RNF2 seqs_with_umi pam 267038
NNN05m RNF2 seqs_with_libbarcode proto 245207
NNN05m RNF2 seqs_with_libbarcode pam 197039
NNN05m RNF2 seqs_with_target proto 304916
NNN05m RNF2 seqs_with_target pam 216541
NNN05m RNF2 seqs_passing_all_filters proto 232142
NNN05m RNF2 seqs_passing_all_filters pam 186157

ONE-SEQ SCORE (replicate scatter plots)

All barcodes that were found in both compared datasets are plotted as scatterplots to evaluate the similarity in ONE-Seq score.

NOTE These include only barcodes that are in both samples.

############################################################
## CREATE SUMMARY DF 
############################################################
r1_one_seq_score_plot_df <- r1_scores_df %>%
  ## retain only necessary cols, barcode will be identical between two datasets
  select(barcode, oneseq_score, lab, target_site) %>%
  ## list the oneseq score under Karl and NIST-2 accordingly
  pivot_wider(names_from = lab, values_from = oneseq_score, values_fill = NA) %>%
  ## drop any rows that contain NA because those were not found in one of the datasets
  ## comparing only those seqs in common between datasets
  drop_na() %>%
  group_by(target_site)

############################################################
## FUNCTION TO MAKE PLOTS
############################################################
## pass the above df and specify which cols to plot
make_oneseq_scatter_plot <- function(df, lab1, lab2){
  ## reduce df
  plot_df <- df %>%
  select(barcode, target_site, all_of(lab1), all_of(lab2))
  ## PLOT
  plot_df %>%
  ggplot(aes_string(x = {{lab1}}, y = {{lab2}})) + 
  geom_point(alpha = 0.25) + 
  ggpubr::stat_cor(aes(label = ..rr.label..), method = "pearson") + 
  facet_wrap(~target_site, scales = "free_x", ncol = 1) + 
  theme_bw() + 
  labs(x = lab1, 
       y =lab2, 
       title = "ONE-Seq Score of Common Barcodes") 
}

Replicate 1 vs. 2

make_oneseq_scatter_plot(r1_one_seq_score_plot_df, "NNN03m", "NNN04m")

Replicate 2 vs. 3

make_oneseq_scatter_plot(r1_one_seq_score_plot_df, "NNN04m", "NNN05m")

Replicate 1 vs. 3

make_oneseq_scatter_plot(r1_one_seq_score_plot_df, "NNN03m", "NNN05m")

############################################################
## REMOVE OBJECT
############################################################
rm(r1_one_seq_score_plot_df)

NORMALIZED COUNTS (replicate scatter plots)

Compare the normalized proto & pam counts that are found in the scores.tsv files.

NOTE only barcodes present in both samples are plotted.

## curate summary df to plot normalized pam + proto counts for each lab & target site
## retain only necessary cols, barcode will be identical between two datasets
## drop any rows that contain NA becaue those were not found in one of the datasets
## comparing only those seqs in common between datasets
############################################################
## DF CURATION
############################################################
## PROTO
## EMX1, FANCF, RNF2
normalized_counts_proto_df <- r1_scores_df %>%
  select(barcode, normalized_counts_proto, lab, target_site) %>%
  pivot_wider(names_from = lab, values_from = normalized_counts_proto, values_fill = NA)

## PAM
## EMX1, FACNF, RNF2
normalized_counts_pam_df <- r1_scores_df %>%
  select(barcode, normalized_counts_pam, lab, target_site) %>%
  pivot_wider(names_from = lab, values_from = normalized_counts_pam, values_fill = NA)

############################################################
## FUNCTION
############################################################
########################## PROTO ########################## 
make.proto.norm.plot <- function(df, lab1, lab2){
  plot_df <- df %>%
  select(barcode, target_site, all_of(lab1), all_of(lab2)) %>%
  drop_na() %>%
  group_by(target_site)
  
  ## plot 
  plot_df %>%
  ggplot(aes_string(x = {{lab1}}, y = {{lab2}})) + 
  geom_point(alpha = 0.25) + 
  ggpubr::stat_cor(aes(label = ..rr.label..), method = "pearson") + 
  facet_wrap(~target_site, ncol = 1) + 
  theme_bw() + 
  labs(x = lab1, 
       y = lab2,
       title = "Normalized Counts of Common Barcodes: Proto")
}
########################## PAM ##########################
make.pam.norm.plot <- function(df, lab1, lab2){
  plot_df <- df %>%
  select(barcode, target_site, all_of(lab1), all_of(lab2)) %>%
  drop_na() %>%
  group_by(target_site)
  
  ## plot 
  plot_df %>%
  ggplot(aes_string(x = {{lab1}}, y = {{lab2}})) + 
  geom_point(alpha = 0.25) + 
  ggpubr::stat_cor(aes(label = ..rr.label..), method = "pearson") + 
  facet_wrap(~target_site, ncol = 1) + 
  theme_bw() + 
  labs(x = lab1, 
       y = lab2,
       title = "Normalized Counts of Common Barcodes: PAM")
}

Replicate 1 vs. 2

make.proto.norm.plot(normalized_counts_proto_df, "NNN03m", "NNN04m")

make.pam.norm.plot(normalized_counts_pam_df, "NNN03m", "NNN04m")

Replicate 2 vs. 3

make.proto.norm.plot(normalized_counts_proto_df, "NNN04m", "NNN05m")

make.pam.norm.plot(normalized_counts_pam_df, "NNN04m", "NNN05m")

Replicate 3 vs. 4

make.proto.norm.plot(normalized_counts_proto_df, "NNN03m", "NNN05m")

make.pam.norm.plot(normalized_counts_pam_df, "NNN03m", "NNN05m")

############################################################
## REMOVE OBJECTS
############################################################
rm(normalized_counts_proto_df)
rm(normalized_counts_pam_df)

RAW COUNTS (replicate scatter plots)

Compare the raw proto & pam counts that are calculated from the scores.tsv files.

NOTE only barcodes present in both samples are plotted.

## curate summary df to plot normalized pam + proto counts for each lab & target site
## retain only necessary cols, barcode will be identical between two datasets
## drop any rows that contain NA becaue those were not found in one of the datasets
## comparing only those seqs in common between datasets
############################################################
## DF CURATION
############################################################

## PROTO
## EMX1, FACNF, RNF2
proto_raw_read_count_df <- r1_scores_df %>%
  select(barcode, proto_raw_read_count, lab, target_site) %>%
  pivot_wider(names_from = lab, values_from = proto_raw_read_count, values_fill = NA)

## PAM
## EMX1, FACNF, RNF2
pam_raw_read_count_df <- r1_scores_df %>%
  select(barcode, pam_raw_read_count, lab, target_site) %>%
  pivot_wider(names_from = lab, values_from = pam_raw_read_count, values_fill = NA)

##########################################################
## FUNCTION
##########################################################
############################## PROTO ##############################
make.proto.raw.counts.plot <- function(df, lab1, lab2){
  
  plot_df <- df %>%
  select(barcode, target_site, lab1, lab2) %>%
  drop_na() %>%
  group_by(target_site)

## plot 
plot_df %>%
  ggplot(aes_string(x = {{lab1}}, y = {{lab2}})) + 
  geom_point(alpha = 0.25) + 
  ggpubr::stat_cor(aes(label = ..rr.label..), method = "pearson") + 
  facet_wrap(~target_site, ncol = 1) + 
  theme_bw() + 
  labs(x = lab1, 
       y = lab2,
       title = "raw Counts of Common Barcodes: Proto")
}
############################## PAM ##############################
make.pam.raw.counts.plot <- function(df, lab1, lab2){
  
  plot_df <- df %>%
  select(barcode, target_site, lab1, lab2) %>%
  drop_na() %>%
  group_by(target_site)

## plot 
plot_df %>%
  ggplot(aes_string(x = {{lab1}}, y = {{lab2}})) + 
  geom_point(alpha = 0.25) + 
  ggpubr::stat_cor(aes(label = ..rr.label..), method = "pearson") + 
  facet_wrap(~target_site, ncol = 1) + 
  theme_bw() + 
  labs(x = lab1, 
       y = lab2,
       title = "raw Counts of Common Barcodes: PAM")
}

Replicate 1 vs. 2

make.proto.raw.counts.plot(proto_raw_read_count_df, "NNN03m", "NNN04m")

make.pam.raw.counts.plot(pam_raw_read_count_df, "NNN03m", "NNN04m")

Replicate 2 vs. 3

make.proto.raw.counts.plot(proto_raw_read_count_df, "NNN05m", "NNN05m")

make.pam.raw.counts.plot(pam_raw_read_count_df, "NNN05m", "NNN05m")

Replicate 1 vs. 3

make.proto.raw.counts.plot(proto_raw_read_count_df, "NNN03m", "NNN05m")

make.pam.raw.counts.plot(pam_raw_read_count_df, "NNN03m", "NNN05m")

##########################################################
## REMOVE OBJECTS
##########################################################
rm(proto_raw_read_count_df)
rm(pam_raw_read_count_df)

VENN BAR: PER TARGET BARCODE COMPARISON

How many barcodes are found among triplicates for each target site? NOTE no count values are taken into consideration.

########################################################################
# DF CURATION 
########################################################################
r1_venn_barcodes_df <- r1_scores_df %>%
  ## DEFINED AT BEGINNING OF R1 SECTION
  filter(lab %in% labs) %>%
  select(barcode, lab, target_site) %>%
  group_by(target_site, barcode) %>%
  summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))

########################################################################
# PLOT 
########################################################################
ggplotly(ggplot(data = r1_venn_barcodes_df) + 
  geom_bar(aes(x = fct_rev(as_factor(target_site)), fill = lab_set), 
           # position = "fill"
           position = "fill") + 
  coord_flip() + 
  theme_bw() + 
  labs(x = "Target Site", 
       y = "Proportion of Sites", 
       fill = "Triplicate Combinations", 
       title = "R1 Replicate Barcodes") +
    scale_fill_manual(values=cbPalette))
########################################################################
# TABLE 
########################################################################
(r1_venn_table_df <- r1_venn_barcodes_df  %>% 
  group_by(target_site) %>% 
  count(lab_set)  %>% 
  group_by(target_site) %>% 
  mutate(target_site_total = sum(n)) %>% 
  mutate(pct = round(n/target_site_total * 100, 0),
         count_value = paste0(n, " (", pct, "%)")) %>% 
  select(-n, -pct) %>% 
  pivot_wider(names_from = "lab_set", values_from = "count_value")%>%
  kbl() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", html_font = "Cambria")))
target_site target_site_total NNN03m NNN03m-NNN04m NNN03m-NNN04m-NNN05m NNN03m-NNN05m NNN04m NNN04m-NNN05m NNN05m
EMX1 41662 4807 (12%) 4881 (12%) 9662 (23%) 4097 (10%) 6709 (16%) 5769 (14%) 5737 (14%)
FANCF 22144 6650 (30%) 3819 (17%) 3872 (17%) 2786 (13%) 2328 (11%) 897 (4%) 1792 (8%)
RNF2 21867 3248 (15%) 2288 (10%) 4964 (23%) 3979 (18%) 1878 (9%) 2140 (10%) 3370 (15%)
########################################################################
# REMOVE UNNECESSARY OBJECTS FROM ENVIRONMENT 
########################################################################
rm(r1_venn_table_df)

BARCODE CONCORDANCE TABLE

Table that breaks down the number of barcodes that are:

  • Not concordant (present in only 1 replicate)
  • Concordant between 2 replicates
  • Concordant in all 3 replicates

NOTE This is based on at least 1 [calculated] raw read to be considered present in the sample. In the future we may want to increase the read threshold to be considered.

########################################################################
# FUNCTION
########################################################################
make_concordance_table <- function(df, pam_proto){
  if(pam_proto == "proto"){
    
    (df %>%
      select(barcode, target_site, lab, proto_raw_read_count) %>%
      ## include only those with reads of at least 1
      filter(proto_raw_read_count > 0) %>%
      select(-proto_raw_read_count) %>%
      group_by(target_site, barcode) %>%
      ## get lab set to figure out how many labs each barcode is present in
      summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))  %>%
      mutate(lab_num = str_count(lab_set, "-") + 1) %>%
      group_by(target_site, lab_num) %>%
      tally() %>%
      mutate(lab_num = sub("^", "concordant_in_", lab_num)) %>%
      pivot_wider(names_from = lab_num, 
                  values_from = n, 
                  values_fill = NA) %>%
      kbl() %>%
      kable_styling(bootstrap_options = c("striped", "hover", "condensed", 
                        "responsive", html_font = "Cambria")))
    
  } else if(pam_proto == "pam"){
    
    (df %>%
      select(barcode, target_site, lab, pam_raw_read_count) %>%
      ## include only those with reads of at least 1
      filter(pam_raw_read_count > 0) %>%
      select(-pam_raw_read_count) %>%
      group_by(target_site, barcode) %>%
      ## get lab set to figure out how many labs each barcode is present in
      summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))  %>%
      mutate(lab_num = str_count(lab_set, "-") + 1) %>%
      group_by(target_site, lab_num) %>%
      tally() %>%
      mutate(lab_num = sub("^", "concordant_in_", lab_num)) %>%
      pivot_wider(names_from = lab_num, 
                  values_from = n, 
                  values_fill = NA) %>%
      kbl() %>%
      kable_styling(bootstrap_options = c("striped", "hover", "condensed", 
                        "responsive", html_font = "Cambria")))
    
    
  } else {
    message("Enter 'pam' or 'proto' for second arg")
  }
}

PROTO

make_concordance_table(r1_scores_df, "proto")
## `summarise()` has grouped output by 'target_site'. You can override using the `.groups` argument.
target_site concordant_in_1 concordant_in_2 concordant_in_3
EMX1 15331 5563 2984
FANCF 7910 1973 1175
RNF2 8727 2909 968

PAM

make_concordance_table(r1_scores_df, "pam")
## `summarise()` has grouped output by 'target_site'. You can override using the `.groups` argument.
target_site concordant_in_1 concordant_in_2 concordant_in_3
EMX1 17842 11169 5484
FANCF 10170 5689 2731
RNF2 9257 6340 2996

DENSITY PLOTS

Similar to the venn bar plots, visualize barcode overlap between samples with raw read counts plotted.

###########################################################################
####### FUNCTION
###########################################################################
  cbPalette <- c("#000000", 
                "#E69F00", 
                "#56B4E9", 
                "#009E73", 
                "#F0E442", 
                "#0072B2", 
                "#D55E00", 
                "#CC79A7",
                "#999999", 
                "#20F03B",
                "#FEC44F", 
                "#D95F0E", 
                "#756BB1", 
                "#FF8A33", 
                "#D7BE07", 
                "#A807D7",
                "dodgerblue2", 
                "#E31A1C", 
                "green4", 
                "#6A3D9A", 
                "#FF7F00", 
                "black", 
                "gold1", 
                "skyblue2",
                "#FB9A99", 
                "palegreen2", 
                "#CAB2D6", 
                "#FDBF6F", 
                "gray70", 
                "khaki2", 
                "maroon", 
                "orchid1",
                "deeppink1",  
                "blue1", 
                "steelblue4", 
                "darkturquoise", 
                "green1", 
                "yellow4", 
                "yellow3",
                "darkorange4", 
                "brown")

######################### PROTO
make.proto.density.plots <- function(df, lablist){
  labs <- c(lablist)
  
  density_plot_labset_df <- df %>%
  filter(lab %in% labs) %>%
  select(barcode, lab, target_site) %>%
  group_by(target_site, barcode) %>%
  summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))
  
  proto_density_plot_df <- df %>%
  filter(lab %in% labs) %>%
  select(barcode, lab, target_site, proto_raw_read_count) %>%
  right_join(density_plot_labset_df) %>%
  count(target_site, lab_set, proto_raw_read_count, name = "Occurrence")
  
  options(scipen = 999)
  
  plot <- ggplot(proto_density_plot_df) + 
    geom_histogram(aes(x = proto_raw_read_count, fill = lab_set), position = "fill") + 
    scale_x_log10() + 
    facet_wrap(~target_site, scales = "free_y", ncol = 1) +
    theme_bw() +
    scale_fill_brewer(type = "qual") +
    labs(x = "Read Count", y = "Proportion of Sites", fill = "Lab Combinations", main = "Proto Raw Read Counts") +
    scale_fill_manual(values=cbPalette)
  
  return(plot)
}

########################## PAM

make.pam.density.plots <- function(df, lablist){
  labs <- c(lablist)
  
  density_plot_labset_df <- df %>%
  filter(lab %in% labs) %>%
  select(barcode, lab, target_site) %>%
  group_by(target_site, barcode) %>%
  summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))
  
  pam_density_plot_df <- df %>%
  filter(lab %in% labs) %>%
  select(barcode, lab, target_site, pam_raw_read_count) %>%
  right_join(density_plot_labset_df) %>%
  count(target_site, lab_set, pam_raw_read_count, name = "Occurrence")
  
  options(scipen = 999)
  
  plot <- ggplot(pam_density_plot_df) + 
    geom_histogram(aes(x = pam_raw_read_count, fill = lab_set), position = "fill") + 
    scale_x_log10() + 
    facet_wrap(~target_site, scales = "free_y", ncol = 1) +
    theme_bw() +
    scale_fill_brewer(type = "qual") +
    labs(x = "Read Count", y = "Proportion of Sites", fill = "Lab Combinations", main = "PAM Raw Read Counts") +
    scale_fill_manual(values=cbPalette)
  
  return(plot)
}

PROTO

lab_list <- c("NNN03m", "NNN04m", "NNN05m")
ggplotly(make.proto.density.plots(r1_scores_df, lab_list))

PAM

ggplotly(make.pam.density.plots(r1_scores_df, lab_list))

DENSITY PLOT: COMBINED PROTO & PAM

For each barcode of each triplicate, take the lowest read count value between proto and pam.

###########################################################################
####### GATHER MIN PAM/PROTO READ VALUE FOR EACH TRIPLICATE
###########################################################################
NNN03m_counts_df <- r1_scores_df %>%
  select(barcode, target_site, lab, pam_raw_read_count, 
         proto_raw_read_count) %>%
  filter(lab == "NNN03m") %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(min_raw_pam_proto_count = pmin(pam_raw_read_count, 
                                        proto_raw_read_count,
                                        na.rm=TRUE)) %>%
  select(-pam_raw_read_count, -proto_raw_read_count)
  
NNN04m_counts_df <- r1_scores_df %>%
  select(barcode, target_site, lab, pam_raw_read_count, 
         proto_raw_read_count) %>%
  filter(lab == "NNN04m") %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(min_raw_pam_proto_count = pmin(pam_raw_read_count, 
                                        proto_raw_read_count,
                                        na.rm=TRUE)) %>%
  select(-pam_raw_read_count, -proto_raw_read_count)

NNN05m_counts_df <- r1_scores_df %>%
  select(barcode, target_site, lab, pam_raw_read_count, 
         proto_raw_read_count) %>%
  filter(lab == "NNN05m") %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(min_raw_pam_proto_count = pmin(pam_raw_read_count, 
                                        proto_raw_read_count,
                                        na.rm=TRUE)) %>%
  select(-pam_raw_read_count, -proto_raw_read_count)

###########################################################################
####### CAT EACH TRIPLICATE DF INTO 1 COHESIVE DF
###########################################################################
r1_combined_proto_pam_df <- plyr::rbind.fill(NNN03m_counts_df, 
                                             NNN04m_counts_df, 
                                             NNN05m_counts_df)

lab_list <- c("NNN03m", "NNN04m", "NNN05m")

###########################################################################
####### FUNCTION
###########################################################################
make.combined.density.plots <- function(df, lablist){
  labs <- c(lablist)
  
  density_plot_labset_df <- df %>%
  filter(lab %in% labs) %>%
  select(barcode, lab, target_site) %>%
  group_by(target_site, barcode) %>%
  summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))
  
  proto_density_plot_df <- df %>%
  filter(lab %in% labs) %>%
  select(barcode, lab, target_site, min_raw_pam_proto_count) %>%
  right_join(density_plot_labset_df) %>%
  count(target_site, lab_set, min_raw_pam_proto_count, name = "Occurrence")
  
  options(scipen = 999)
  
  plot <- ggplot(proto_density_plot_df) + 
    geom_histogram(aes(x = min_raw_pam_proto_count, fill = lab_set), 
                   position = "fill") + 
    scale_x_log10() + 
    facet_wrap(~target_site, scales = "free_y", ncol = 1) +
    theme_bw() +
    scale_fill_brewer(type = "qual") +
    labs(x = "Read Count", y = "Proportion of Sites", fill = "Lab Combinations", main = "Combined Proto & PAM Raw Read Counts") +
    scale_fill_manual(values=cbPalette)
  
  return(plot)
}
make.combined.density.plots(r1_combined_proto_pam_df, lab_list)


R2 REPLICATE COMPARISONS

Create a df with R2 data only.

########################################################################
# DEFINE LABS
########################################################################
## create r2 df: NNN02, NNN03, NNN04
labs <- c("NNN06m", "NNN07m", "NNN08m")

########################################################################
# CREATE R2 SCORES DF
########################################################################
r2_scores_df <- scores_df %>%
  filter(lab %in% labs)

########################################################################
# CREATE R2 QC DF
########################################################################
r2_qc_df <- qc_df %>%
  filter(lab %in% labs)

TOTAL NUMBER OF BARCODES

The total number of barcodes with at least 1 read for each triplicate & target site.

########################################################################
# CREATE MINIMAL DF THAT DESCRIBES TOTAL BARCODES
########################################################################
total_barcodes_plot_df <- r2_scores_df %>%
  group_by(target_site, lab) %>%
  count(target_site, lab,  name = "total_barcodes") %>%
  arrange(target_site, lab)

########################################################################
# PLOT
######################################################################## 
ggplotly(ggplot(total_barcodes_plot_df) + 
    geom_bar(aes(x = target_site, y = total_barcodes, fill = lab), 
             # dodge prserves vertical position, single groups bars of same metric together
             # stat = identity skips aggregation of values - I provide values
             position = position_dodge(preserve = "single"), stat = "identity") + 
    scale_y_continuous(labels = comma) + # add commas to axis label numbers
    theme_bw() + 
    theme(axis.text.x = element_text(angle = 90)) + 
    labs(x = "Target Site", y = "Total Number of Barcodes", 
         fill = "Lab", main = "Total Number of Barcodes") +
      scale_fill_manual(values=cbPalette))
########################################################################
# TABLE
########################################################################
(total_barcodes_plot_df %>%
    pivot_wider(names_from = "lab",
              values_from = "total_barcodes", 
              values_fill = 0) %>%
  rowwise() %>%
  mutate(AVG = mean(c(NNN06m, NNN07m, NNN08m)),
         SD = sd(c(NNN06m, NNN07m, NNN08m)),
         CV = SD / AVG) %>%
  arrange(desc(CV)) %>%
  kbl() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", html_font = "Cambria")))
target_site NNN06m NNN07m NNN08m AVG SD CV
EMX1 29895 25130 24803 26609.33 2850.1643 0.1071114
FANCF 13038 10535 11786 11786.33 1251.5000 0.1061823
RNF2 16074 15747 14851 15557.33 633.1764 0.0406995
########################################################################
# REMOVE ENV VARS 
########################################################################
rm(total_barcodes_plot_df)

QC READ COUNTS

From the ONE-Seq QC_scores.csv file: The number of reads (proto & pam) in each QC category, plotted for each target site.

EMX1

## FUNCTIONS ARE IN R1 SECTION
make_qc_reads_plot(r2_qc_df, "EMX1")
make_qc_table(r2_qc_df, "EMX1")
## Warning: Unknown or uninitialised column: `labs`.
lab target_site var proto_pam read_count
NNN06m EMX1 seqs proto 617312
NNN06m EMX1 seqs pam 775065
NNN06m EMX1 seqs_with_primer proto 589315
NNN06m EMX1 seqs_with_primer pam 742960
NNN06m EMX1 seqs_with_umi proto 492883
NNN06m EMX1 seqs_with_umi pam 703540
NNN06m EMX1 seqs_with_libbarcode proto 440147
NNN06m EMX1 seqs_with_libbarcode pam 539561
NNN06m EMX1 seqs_with_target proto 533292
NNN06m EMX1 seqs_with_target pam 586224
NNN06m EMX1 seqs_passing_all_filters proto 415366
NNN06m EMX1 seqs_passing_all_filters pam 510987
NNN07m EMX1 seqs proto 669362
NNN07m EMX1 seqs pam 678091
NNN07m EMX1 seqs_with_primer proto 642338
NNN07m EMX1 seqs_with_primer pam 652737
NNN07m EMX1 seqs_with_umi proto 541210
NNN07m EMX1 seqs_with_umi pam 618541
NNN07m EMX1 seqs_with_libbarcode proto 472278
NNN07m EMX1 seqs_with_libbarcode pam 448568
NNN07m EMX1 seqs_with_target proto 570877
NNN07m EMX1 seqs_with_target pam 488057
NNN07m EMX1 seqs_passing_all_filters proto 448146
NNN07m EMX1 seqs_passing_all_filters pam 427144
NNN08m EMX1 seqs proto 622705
NNN08m EMX1 seqs pam 629810
NNN08m EMX1 seqs_with_primer proto 596404
NNN08m EMX1 seqs_with_primer pam 606283
NNN08m EMX1 seqs_with_umi proto 500464
NNN08m EMX1 seqs_with_umi pam 573867
NNN08m EMX1 seqs_with_libbarcode proto 435471
NNN08m EMX1 seqs_with_libbarcode pam 413872
NNN08m EMX1 seqs_with_target proto 527930
NNN08m EMX1 seqs_with_target pam 451407
NNN08m EMX1 seqs_passing_all_filters proto 412352
NNN08m EMX1 seqs_passing_all_filters pam 394733

FANCF

## FUNCTIONS ARE IN R1 SECTION
make_qc_reads_plot(r2_qc_df, "FANCF")
make_qc_table(r2_qc_df, "FANCF")
## Warning: Unknown or uninitialised column: `labs`.
lab target_site var proto_pam read_count
NNN06m FANCF seqs proto 528619
NNN06m FANCF seqs pam 584983
NNN06m FANCF seqs_with_primer proto 506657
NNN06m FANCF seqs_with_primer pam 563253
NNN06m FANCF seqs_with_umi proto 435878
NNN06m FANCF seqs_with_umi pam 535807
NNN06m FANCF seqs_with_libbarcode proto 393694
NNN06m FANCF seqs_with_libbarcode pam 421416
NNN06m FANCF seqs_with_target proto 472142
NNN06m FANCF seqs_with_target pam 457092
NNN06m FANCF seqs_passing_all_filters proto 373918
NNN06m FANCF seqs_passing_all_filters pam 402064
NNN07m FANCF seqs proto 361966
NNN07m FANCF seqs pam 432023
NNN07m FANCF seqs_with_primer proto 346021
NNN07m FANCF seqs_with_primer pam 415374
NNN07m FANCF seqs_with_umi proto 297099
NNN07m FANCF seqs_with_umi pam 395643
NNN07m FANCF seqs_with_libbarcode proto 266494
NNN07m FANCF seqs_with_libbarcode pam 308255
NNN07m FANCF seqs_with_target proto 319474
NNN07m FANCF seqs_with_target pam 333736
NNN07m FANCF seqs_passing_all_filters proto 252081
NNN07m FANCF seqs_passing_all_filters pam 293366
NNN08m FANCF seqs proto 463516
NNN08m FANCF seqs pam 467310
NNN08m FANCF seqs_with_primer proto 445307
NNN08m FANCF seqs_with_primer pam 449773
NNN08m FANCF seqs_with_umi proto 385596
NNN08m FANCF seqs_with_umi pam 428270
NNN08m FANCF seqs_with_libbarcode proto 347129
NNN08m FANCF seqs_with_libbarcode pam 339337
NNN08m FANCF seqs_with_target proto 412601
NNN08m FANCF seqs_with_target pam 367216
NNN08m FANCF seqs_passing_all_filters proto 330488
NNN08m FANCF seqs_passing_all_filters pam 323445

RNF2

## FUNCTIONS ARE IN R1 SECTION
make_qc_reads_plot(r2_qc_df, "RNF2")
make_qc_table(r2_qc_df, "RNF2")
## Warning: Unknown or uninitialised column: `labs`.
lab target_site var proto_pam read_count
NNN06m RNF2 seqs proto 689586
NNN06m RNF2 seqs pam 443239
NNN06m RNF2 seqs_with_primer proto 662104
NNN06m RNF2 seqs_with_primer pam 427084
NNN06m RNF2 seqs_with_umi proto 556713
NNN06m RNF2 seqs_with_umi pam 404090
NNN06m RNF2 seqs_with_libbarcode proto 460917
NNN06m RNF2 seqs_with_libbarcode pam 293395
NNN06m RNF2 seqs_with_target proto 564109
NNN06m RNF2 seqs_with_target pam 319872
NNN06m RNF2 seqs_passing_all_filters proto 437965
NNN06m RNF2 seqs_passing_all_filters pam 279851
NNN07m RNF2 seqs proto 312944
NNN07m RNF2 seqs pam 286467
NNN07m RNF2 seqs_with_primer proto 299683
NNN07m RNF2 seqs_with_primer pam 276410
NNN07m RNF2 seqs_with_umi proto 249380
NNN07m RNF2 seqs_with_umi pam 261450
NNN07m RNF2 seqs_with_libbarcode proto 216234
NNN07m RNF2 seqs_with_libbarcode pam 203807
NNN07m RNF2 seqs_with_target proto 265057
NNN07m RNF2 seqs_with_target pam 221851
NNN07m RNF2 seqs_passing_all_filters proto 204899
NNN07m RNF2 seqs_passing_all_filters pam 194637
NNN08m RNF2 seqs proto 317635
NNN08m RNF2 seqs pam 341128
NNN08m RNF2 seqs_with_primer proto 304781
NNN08m RNF2 seqs_with_primer pam 328764
NNN08m RNF2 seqs_with_umi proto 256586
NNN08m RNF2 seqs_with_umi pam 311342
NNN08m RNF2 seqs_with_libbarcode proto 212667
NNN08m RNF2 seqs_with_libbarcode pam 233009
NNN08m RNF2 seqs_with_target proto 260487
NNN08m RNF2 seqs_with_target pam 254010
NNN08m RNF2 seqs_passing_all_filters proto 201734
NNN08m RNF2 seqs_passing_all_filters pam 222341

ONE-SEQ SCORE (replicate scatter plots)

All barcodes that were found in both compared datasets are plotted as scatterplots to evaluate the similarity in ONE-Seq score.

NOTE These include only barcodes that are in both samples.

############################################################
## CREATE SUMMARY DF 
############################################################
r2_one_seq_score_plot_df <- r2_scores_df %>%
  ## retain only necessary cols, barcode will be identical between two datasets
  select(barcode, oneseq_score, lab, target_site) %>%
  ## list the oneseq score under Karl and NIST-2 accordingly
  pivot_wider(names_from = lab, values_from = oneseq_score, values_fill = NA) %>%
  ## drop any rows that contain NA because those were not found in one of the datasets
  ## comparing only those seqs in common between datasets
  drop_na() %>%
  group_by(target_site)

Replicate 1 vs. 2

## FUNCTION IN R1 SECTION
make_oneseq_scatter_plot(r2_one_seq_score_plot_df, "NNN06m", "NNN07m")

Replicate 2 vs. 3

## FUNCTION IN R1 SECTION
make_oneseq_scatter_plot(r2_one_seq_score_plot_df, "NNN07m", "NNN08m")

Replicate 1 vs. 3

## FUNCTION IN R1 SECTION
make_oneseq_scatter_plot(r2_one_seq_score_plot_df, "NNN06m", "NNN08m")

############################################################
## REMOVE OBJECT
############################################################
rm(r2_one_seq_score_plot_df)

NORMALIZED COUNTS (replicate scatter plots)

Compare the normalized proto & pam counts that are found in the scores.tsv files.

NOTE only barcodes present in both samples are plotted.

## curate summary df to plot normalized pam + proto counts for each lab & target site
## retain only necessary cols, barcode will be identical between two datasets
## drop any rows that contain NA becaue those were not found in one of the datasets
## comparing only those seqs in common between datasets
############################################################
## DF CURATION
############################################################
################################### PROTO ##################
## EMX1, FANCF, RNF2
normalized_counts_proto_df <- r2_scores_df %>%
  select(barcode, normalized_counts_proto, lab, target_site) %>%
  pivot_wider(names_from = lab, values_from = normalized_counts_proto, values_fill = NA)

################################### PAM ####################
## EMX1, FACNF, RNF2
normalized_counts_pam_df <- r2_scores_df %>%
  select(barcode, normalized_counts_pam, lab, target_site) %>%
  pivot_wider(names_from = lab, values_from = normalized_counts_pam, values_fill = NA)

Replicate 1 vs. 2

## FUNCTION DEFINED IN R1 SECTION
make.proto.norm.plot(normalized_counts_proto_df, "NNN06m", "NNN07m")

make.pam.norm.plot(normalized_counts_pam_df, "NNN06m", "NNN07m")

Replicate 2 vs. 3

## FUNCTION DEFINED IN R1 SECTION
make.proto.norm.plot(normalized_counts_proto_df, "NNN07m", "NNN08m")

make.pam.norm.plot(normalized_counts_pam_df, "NNN07m", "NNN08m")

Replicate 3 vs. 4

## FUNCTION DEFINED IN R1 SECTION
make.proto.norm.plot(normalized_counts_proto_df, "NNN06m", "NNN08m")

make.pam.norm.plot(normalized_counts_pam_df, "NNN06m", "NNN08m")

############################################################
## REMOVE OBJECTS
############################################################
rm(normalized_counts_proto_df)
rm(normalized_counts_pam_df)

RAW COUNTS (replicate scatter plots)

Compare the raw proto & pam counts that are calculated from the scores.tsv files.

NOTE only barcodes present in both samples are plotted.

## curate summary df to plot normalized pam + proto counts for each lab & target site
## retain only necessary cols, barcode will be identical between two datasets
## drop any rows that contain NA becaue those were not found in one of the datasets
## comparing only those seqs in common between datasets
#############################################
## DF CURATION
#############################################
######################### PROTO
## EMX1, FACNF, RNF2
proto_raw_read_count_df <- r2_scores_df %>%
  select(barcode, proto_raw_read_count, lab, target_site) %>%
  pivot_wider(names_from = lab, values_from = proto_raw_read_count, values_fill = NA)

######################### PAM
## EMX1, FACNF, RNF2
pam_raw_read_count_df <- r2_scores_df %>%
  select(barcode, pam_raw_read_count, lab, target_site) %>%
  pivot_wider(names_from = lab, values_from = pam_raw_read_count, values_fill = NA)

Replicate 1 vs. 2

## FUNCTION DEFINED IN R1 SECTION
make.proto.raw.counts.plot(proto_raw_read_count_df, "NNN06m", "NNN07m")

make.pam.raw.counts.plot(pam_raw_read_count_df, "NNN06m", "NNN07m")

Replicate 2 vs. 3

## FUNCTION DEFINED IN R1 SECTION
make.proto.raw.counts.plot(proto_raw_read_count_df, "NNN07m", "NNN08m")

make.pam.raw.counts.plot(pam_raw_read_count_df, "NNN07m", "NNN08m")

Replicate 1 vs. 3

## FUNCTION DEFINED IN R1 SECTION
make.proto.raw.counts.plot(proto_raw_read_count_df, "NNN06m", "NNN08m")

make.pam.raw.counts.plot(pam_raw_read_count_df, "NNN06m", "NNN08m")

##########################################################
## REMOVE OBJECTS
##########################################################
rm(proto_raw_read_count_df)
rm(pam_raw_read_count_df)

VENN BAR: PER TARGET BARCODE COMPARISON

How many barcodes are found among triplicates for each target site? NOTE no count values are taken into consideration.

##########################################################
## DF CURATION
##########################################################
r2_venn_barcodes_df <- r2_scores_df %>%
  filter(lab %in% labs) %>%
  select(barcode, lab, target_site) %>%
  group_by(target_site, barcode) %>%
  summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))

##########################################################
## PLOT
##########################################################
ggplotly(ggplot(data = r2_venn_barcodes_df) + 
  geom_bar(aes(x = fct_rev(as_factor(target_site)), fill = lab_set), 
           # position = "fill"
           position = "fill") + 
  coord_flip() + 
  theme_bw() + 
  labs(x = "Target Site", 
       y = "Proportion of Sites", 
       fill = "Triplicate Combinations", 
       title = "R2 Replicate Barcodes") +
    scale_fill_manual(values=cbPalette))
##########################################################
## TABLE
##########################################################
(r2_venn_table_df <- r2_venn_barcodes_df  %>% 
  group_by(target_site) %>% 
  count(lab_set)  %>% 
  group_by(target_site) %>% 
  mutate(target_site_total = sum(n)) %>% 
  mutate(pct = round(n/target_site_total * 100, 0),
         count_value = paste0(n, " (", pct, "%)")) %>% 
  select(-n, -pct) %>% 
  pivot_wider(names_from = "lab_set", values_from = "count_value")%>%
  kbl() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", html_font = "Cambria")))
target_site target_site_total NNN06m NNN06m-NNN07m NNN06m-NNN07m-NNN08m NNN06m-NNN08m NNN07m NNN07m-NNN08m NNN08m
EMX1 42608 7207 (17%) 6004 (14%) 10831 (25%) 5853 (14%) 4594 (11%) 3701 (9%) 4418 (10%)
FANCF 20772 4155 (20%) 2145 (10%) 3975 (19%) 2763 (13%) 2686 (13%) 1729 (8%) 3319 (16%)
RNF2 23262 2575 (11%) 3379 (15%) 7255 (31%) 2865 (12%) 2457 (11%) 2656 (11%) 2075 (9%)
########################################################################
# REMOVE UNNECESSARY OBJECTS FROM ENVIRONMENT 
########################################################################
rm(r2_venn_table_df)

BARCODE CONCORDANCE TABLE

Table that breaks down the number of barcodes that are:

  • Not concordant (present in only 1 replicate)
  • Concordant between 2 replicates
  • Concordant in all 3 replicates

NOTE This is based on at least 1 [calculated] raw read to be considered present in the sample. In the future we may want to increase the read threshold to be considered.

PROTO

## FUNCTION DEFINED IN R1 SECTION
make_concordance_table(r2_scores_df, "proto")
## `summarise()` has grouped output by 'target_site'. You can override using the `.groups` argument.
target_site concordant_in_1 concordant_in_2 concordant_in_3
EMX1 14110 4514 2611
FANCF 7645 2244 1470
RNF2 8721 3148 1236

PAM

## FUNCTION DEFINED IN R1 SECTION
make_concordance_table(r2_scores_df, "pam")
## `summarise()` has grouped output by 'target_site'. You can override using the `.groups` argument.
target_site concordant_in_1 concordant_in_2 concordant_in_3
EMX1 17296 12897 7367
FANCF 9438 4641 2525
RNF2 8269 7617 4837

DENSITY PLOTS

Similar to the venn bar plots, visualize barcode overlap between samples with raw read counts plotted.

PROTO

## FUNCTION DEFINED IN R1 SECTION
lab_list <- c("NNN06m", "NNN07m", "NNN08m")
ggplotly(make.proto.density.plots(r2_scores_df, lab_list))

PAM

## FUNCTION DEFINED IN R1 SECTION
ggplotly(make.pam.density.plots(r2_scores_df, lab_list))

DENSITY PLOT: COMBINED PROTO & PAM

For each barcode of each triplicate, take the lowest read count value between proto and pam.

###########################################################################
####### GATHER MIN PAM/PROTO READ VALUE FOR EACH TRIPLICATE
###########################################################################
NNN06m_counts_df <- r2_scores_df %>%
  select(barcode, target_site, lab, pam_raw_read_count, 
         proto_raw_read_count) %>%
  filter(lab == "NNN06m") %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(min_raw_pam_proto_count = pmin(pam_raw_read_count, 
                                        proto_raw_read_count,
                                        na.rm=TRUE)) %>%
  select(-pam_raw_read_count, -proto_raw_read_count)
  
NNN07m_counts_df <- r2_scores_df %>%
  select(barcode, target_site, lab, pam_raw_read_count, 
         proto_raw_read_count) %>%
  filter(lab == "NNN07m") %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(min_raw_pam_proto_count = pmin(pam_raw_read_count, 
                                        proto_raw_read_count,
                                        na.rm=TRUE)) %>%
  select(-pam_raw_read_count, -proto_raw_read_count)

NNN08m_counts_df <- r2_scores_df %>%
  select(barcode, target_site, lab, pam_raw_read_count, 
         proto_raw_read_count) %>%
  filter(lab == "NNN08m") %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(min_raw_pam_proto_count = pmin(pam_raw_read_count, 
                                        proto_raw_read_count,
                                        na.rm=TRUE)) %>%
  select(-pam_raw_read_count, -proto_raw_read_count)

###########################################################################
####### CAT EACH TRIPLICATE DF INTO 1 COHESIVE DF
###########################################################################
r2_combined_proto_pam_df <- plyr::rbind.fill(NNN06m_counts_df, 
                                             NNN07m_counts_df, 
                                             NNN08m_counts_df)

lab_list <- c("NNN06m", "NNN07m", "NNN08m")

###########################################################################
####### FUNCTION
###########################################################################
make.combined.density.plots <- function(df, lablist){
  labs <- c(lablist)
  
  density_plot_labset_df <- df %>%
  filter(lab %in% labs) %>%
  select(barcode, lab, target_site) %>%
  group_by(target_site, barcode) %>%
  summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))
  
  proto_density_plot_df <- df %>%
  filter(lab %in% labs) %>%
  select(barcode, lab, target_site, min_raw_pam_proto_count) %>%
  right_join(density_plot_labset_df) %>%
  count(target_site, lab_set, min_raw_pam_proto_count, name = "Occurrence")
  
  options(scipen = 999)
  
  plot <- ggplot(proto_density_plot_df) + 
    geom_histogram(aes(x = min_raw_pam_proto_count, fill = lab_set), 
                   position = "fill") + 
    scale_x_log10() + 
    facet_wrap(~target_site, scales = "free_y", ncol = 1) +
    theme_bw() +
    scale_fill_brewer(type = "qual") +
    labs(x = "Read Count", y = "Proportion of Sites", fill = "Lab Combinations", main = "Combined Proto & PAM Raw Read Counts") +
    scale_fill_manual(values=cbPalette)
  
  return(plot)
}
make.combined.density.plots(r2_combined_proto_pam_df, lab_list)


R1 vs. R2

For each set of triplicates (R1 and R2) take the lowest non-zero value among all triplicates for each barcode. If there is at least 1 read count among any of the triplicates, that value is recorded.

Example data frame showing how selection occurs

Concatenate triplicates for each target site so we can compare R1 to R2 as a whole.

## create r2 df: NNN02, NNN03, NNN04
labs <- c("NNN03m", "NNN04m", "NNN05m", "NNN06m", "NNN07m", "NNN08m")

##########################################################
##########################  R1 ###########################  
##########################################################
############### CREATE DF FOR EACH CATEGORY ##############
####### NORMALIZED PAM
r1_norm_pam_df <- r1_scores_df %>%
  select(barcode, normalized_counts_pam, target_site) %>%
  group_by(barcode, target_site) %>%
  pivot_wider(names_from = lab,
              values_from = normalized_counts_pam,
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(R1_min_norm_pam_count = pmin(NNN03m, NNN04m, NNN05m,
                                      na.rm=TRUE)) %>%
  select(-NNN03m, -NNN04m, -NNN05m)

####### NORMALIZED PROTO
r1_norm_proto_df <- r1_scores_df %>%
  select(barcode, normalized_counts_proto, target_site) %>%
  group_by(barcode, target_site) %>%
  pivot_wider(names_from = lab,
              values_from = normalized_counts_proto,
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(R1_min_norm_proto_count = pmin(NNN03m, NNN04m, NNN05m, 
                                    na.rm=TRUE)) %>%
  select(-NNN03m, -NNN04m, -NNN05m)
  
####### PAM ON TARGET
r1_pam_target_df <- r1_scores_df %>%
  select(barcode, target_site, pam_on_target_reads) %>%
  group_by(barcode, target_site) %>%
  pivot_wider(names_from = lab,
              values_from = pam_on_target_reads,
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(R1_min_pam_on_target_read = pmin(NNN03m, NNN04m, NNN05m, 
                                    na.rm=TRUE)) %>%
  select(-NNN03m, -NNN04m, -NNN05m)

####### PROTO ON TARGET
r1_proto_target_df <- r1_scores_df %>%
  select(barcode, target_site, proto_on_target_reads) %>%
  group_by(barcode, target_site) %>%
  pivot_wider(names_from = lab,
              values_from = proto_on_target_reads,
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(R1_min_proto_on_target_read = pmin(NNN03m, NNN04m, NNN05m, 
                                    na.rm=TRUE)) %>%
  select(-NNN03m, -NNN04m, -NNN05m)
  
####### RAW PAM
r1_raw_pam_df <- r1_scores_df %>%
  select(barcode, target_site, pam_raw_read_count) %>%
  group_by(barcode, target_site) %>%
  pivot_wider(names_from = lab,
              values_from = pam_raw_read_count,
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(R1_min_pam_raw_read_count = pmin(NNN03m, NNN04m, NNN05m, 
                                    na.rm=TRUE)) %>%
  select(-NNN03m, -NNN04m, -NNN05m)
  
####### RAW PROTO
r1_raw_proto_df <- r1_scores_df %>%
  select(barcode, target_site, proto_raw_read_count) %>%
  group_by(barcode, target_site) %>%
  pivot_wider(names_from = lab,
              values_from = proto_raw_read_count,
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(R1_min_proto_raw_read_count = pmin(NNN03m, NNN04m, NNN05m, 
                                    na.rm=TRUE)) %>%
  select(-NNN03m, -NNN04m, -NNN05m)
  
####### ONESEQ SCORE
r1_oneseq_df <- r1_scores_df %>%
  select(barcode, oneseq_score, target_site) %>%
  group_by(barcode, target_site) %>%
  pivot_wider(names_from = lab,
              values_from = oneseq_score,
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(R1_min_oneseq_score = pmin(NNN03m, NNN04m, NNN05m, 
                                    na.rm=TRUE)) %>%
  select(-NNN03m, -NNN04m, -NNN05m)

############### COMBINE DFS ##############
r1_combined_df <- r1_norm_pam_df %>%
  left_join(r1_norm_proto_df) %>%
  left_join(r1_pam_target_df) %>%
  left_join(r1_proto_target_df) %>%
  left_join(r1_raw_pam_df) %>%
  left_join(r1_raw_proto_df) %>%
  left_join(r1_oneseq_df) %>%
  mutate(dataset = "R1") %>%
  rename(min_norm_pam_count = R1_min_norm_pam_count, 
         min_norm_proto_count = R1_min_norm_proto_count, 
         min_pam_on_target_read = R1_min_pam_on_target_read, 
         min_proto_on_target_read = R1_min_proto_on_target_read, 
         min_oneseq_score = R1_min_oneseq_score, 
         min_pam_raw_read_count = R1_min_pam_raw_read_count,
         min_proto_raw_read_count = R1_min_proto_raw_read_count)
  
##########################################################
##########################  R1 ###########################  
##########################################################
############### CREATE DF FOR EACH CATEGORY ##############
####### NORMALIZED PAM
r2_norm_pam_df <- r2_scores_df %>%
  select(barcode, normalized_counts_pam, target_site) %>%
  group_by(barcode, target_site) %>%
  pivot_wider(names_from = lab,
              values_from = normalized_counts_pam,
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(r2_min_norm_pam_count = pmin(NNN06m, NNN07m, NNN08m,
                                      na.rm=TRUE)) %>%
  select(-NNN06m, -NNN07m, -NNN08m)

####### NORMALIZED PROTO
r2_norm_proto_df <- r2_scores_df %>%
  select(barcode, normalized_counts_proto, target_site) %>%
  group_by(barcode, target_site) %>%
  pivot_wider(names_from = lab,
              values_from = normalized_counts_proto,
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(r2_min_norm_proto_count = pmin(NNN06m, NNN07m, NNN08m, 
                                    na.rm=TRUE)) %>%
  select(-NNN06m, -NNN07m, -NNN08m)
  
####### ON-TARGET PAM
r2_pam_target_df <- r2_scores_df %>%
  select(barcode, target_site, pam_on_target_reads) %>%
  group_by(barcode, target_site) %>%
  pivot_wider(names_from = lab,
              values_from = pam_on_target_reads,
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(r2_min_pam_on_target_read = pmin(NNN06m, NNN07m, NNN08m, 
                                    na.rm=TRUE)) %>%
  select(-NNN06m, -NNN07m, -NNN08m)

####### ON-TARGET PROTO
r2_proto_target_df <- r2_scores_df %>%
  select(barcode, target_site, proto_on_target_reads) %>%
  group_by(barcode, target_site) %>%
  pivot_wider(names_from = lab,
              values_from = proto_on_target_reads,
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(r2_min_proto_on_target_read = pmin(NNN06m, NNN07m, NNN08m, 
                                    na.rm=TRUE)) %>%
  select(-NNN06m, -NNN07m, -NNN08m)
  
####### RAW PAM
r2_raw_pam_df <- r2_scores_df %>%
  select(barcode, target_site, pam_raw_read_count) %>%
  group_by(barcode, target_site) %>%
  pivot_wider(names_from = lab,
              values_from = pam_raw_read_count,
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(r2_min_pam_raw_read_count = pmin(NNN06m, NNN07m, NNN08m, 
                                    na.rm=TRUE)) %>%
  select(-NNN06m, -NNN07m, -NNN08m)
  
####### RAW PROTO
r2_raw_proto_df <- r2_scores_df %>%
  select(barcode, target_site, proto_raw_read_count) %>%
  group_by(barcode, target_site) %>%
  pivot_wider(names_from = lab,
              values_from = proto_raw_read_count,
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(r2_min_proto_raw_read_count = pmin(NNN06m, NNN07m, NNN08m, 
                                    na.rm=TRUE)) %>%
  select(-NNN06m, -NNN07m, -NNN08m)
  
####### ONESEQ SCORE
r2_oneseq_df <- r2_scores_df %>%
  select(barcode, oneseq_score, target_site) %>%
  group_by(barcode, target_site) %>%
  pivot_wider(names_from = lab,
              values_from = oneseq_score,
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(r2_min_oneseq_score = pmin(NNN06m, NNN07m, NNN08m, 
                                    na.rm=TRUE)) %>%
  select(-NNN06m, -NNN07m, -NNN08m)

############### COMBINE DFS ##############
r2_combined_df <- r2_norm_pam_df %>%
  left_join(r2_norm_proto_df) %>%
  left_join(r2_pam_target_df) %>%
  left_join(r2_proto_target_df) %>%
  left_join(r2_raw_pam_df) %>%
  left_join(r2_raw_proto_df) %>%
  left_join(r2_oneseq_df) %>%
  mutate(dataset = "R2") %>%
  rename(min_norm_pam_count = r2_min_norm_pam_count, 
         min_norm_proto_count = r2_min_norm_proto_count, 
         min_pam_on_target_read = r2_min_pam_on_target_read, 
         min_proto_on_target_read = r2_min_proto_on_target_read, 
         min_oneseq_score = r2_min_oneseq_score, 
         min_pam_raw_read_count = r2_min_pam_raw_read_count,
         min_proto_raw_read_count = r2_min_proto_raw_read_count) 

##########################################################
## CREATE A COMBINED R1/R2 DF ##
##########################################################
r1_r2_scores_df <- plyr::rbind.fill(r1_combined_df, r2_combined_df)
  
########################################################
## REMOVE UNNECESSARY ENV VARS ##
########################################################
rm(r1_norm_pam_df)
rm(r1_norm_proto_df)
rm(r1_pam_target_df)
rm(r1_proto_target_df)
rm(r1_raw_pam_df)
rm(r1_raw_proto_df)
rm(r1_oneseq_df)
rm(r2_norm_pam_df)
rm(r2_norm_proto_df)
rm(r2_pam_target_df)
rm(r2_proto_target_df)
rm(r2_raw_pam_df)
rm(r2_raw_proto_df)
rm(r2_oneseq_df)

TOTAL NUMBER OF BARCODES

The total number of barcodes with at least 1 read for each sample & target site. Triplicates are combined to create one set of R1 and one set of R2 data points. See main R1-R2 section for details on how this accomplished.

######################################################################
## curate summary df to plot number of off-target sits per target site & lab
r1_total_off_target_sites_plot_df <- r1_combined_df %>%
  group_by(target_site) %>%
  count(target_site, name = "r1_total_off_target_sites") %>%
  arrange(target_site)

r2_total_off_target_sites_plot_df <- r2_combined_df %>%
  group_by(target_site) %>%
  count(target_site, name = "r2_total_off_target_sites") %>%
  arrange(target_site)

r1_r2_total_off_target_sites_df <- r1_total_off_target_sites_plot_df %>%
  left_join(r2_total_off_target_sites_plot_df) %>%
  pivot_longer(cols = c("r1_total_off_target_sites",
                        "r2_total_off_target_sites"), 
                        names_to = "replicate", values_to = "value")%>%
  mutate(replicate = str_remove(replicate, "_total_off_target_sites")) %>%
  mutate(replicate = str_replace(replicate, "r1", "R1"),
         replicate = str_replace(replicate, "r2", "R2"))

##########
## plot ##
##########
ggplotly(ggplot(r1_r2_total_off_target_sites_df) + 
    geom_bar(aes(x = target_site, y = value, fill = replicate), 
             # dodge prserves vertical position, single groups bars of same metric together
             # stat = identity skips aggregation of values - I provide values
             position = position_dodge(preserve = "single"), stat = "identity") + 
    scale_y_continuous(labels = comma) + # add commas to axis label numbers
    theme_bw() + 
    theme(axis.text.x = element_text(angle = 90)) + 
    labs(x = "Target Site", y = "Total Number of Sites", 
         fill = "Lab", main = "Total Number of Barcodes") +
      scale_fill_manual(values=cbPalette))
(r1_r2_total_off_target_sites_df %>%
    rename(total_sites = value) %>%
    pivot_wider(names_from = "replicate",
              values_from = "total_sites", 
              values_fill = 0) %>%
  rowwise() %>%
  mutate(AVG = mean(c(R1, R2)),
         SD = sd(c(R1, R2)),
         CV = SD / AVG) %>%
  arrange(desc(CV)) %>%
  kbl() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", html_font = "Cambria")))
target_site R1 R2 AVG SD CV
FANCF 22144 20772 21458.0 970.1505 0.0452116
RNF2 21867 23262 22564.5 986.4140 0.0437153
EMX1 41662 42608 42135.0 668.9230 0.0158757
########################################################
## REMOVE UNNECESSARY ENV VARS ##
########################################################
rm(r1_total_off_target_sites_plot_df)
rm(r2_total_off_target_sites_plot_df)

QC READ COUNTS

From the ONE-Seq QC_scores.csv file: The number of reads (proto & pam) in each QC category. Triplicates are combined to create one set of R1 and one set of R2 data points. See main R1-R2 section for details on how this accomplished.

########################################################
## CURATE R1 DF
########################################################
r1_proto_qc_df <- r1_qc_df %>%
  select(-pam) %>%
  pivot_wider(names_from = lab,
              values_from = proto,
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(R1_proto_value = pmin(NNN03m, NNN04m, NNN05m, na.rm=TRUE)) %>%
  select(-NNN03m, -NNN04m, -NNN05m) %>%
  mutate(lab = "R1") %>%
  rename(proto = R1_proto_value)

r1_pam_qc_df <- r1_qc_df %>%
  select(-proto) %>%
  pivot_wider(names_from = lab,
              values_from = pam,
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(R1_pam_value = pmin(NNN03m, NNN04m, NNN05m, na.rm=TRUE)) %>%
  select(-NNN03m, -NNN04m, -NNN05m)%>%
  mutate(lab = "R1") %>%
  rename(pam = R1_pam_value)

r1_combined_qc_df <- r1_proto_qc_df %>%
  left_join(r1_pam_qc_df)
## Joining, by = c("target_site", "var", "lab")
########################################################
## CURATE R2 DF
########################################################
r2_proto_qc_df <- r2_qc_df %>%
  select(-pam) %>%
  pivot_wider(names_from = lab,
              values_from = proto,
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(R2_proto_value = pmin(NNN06m, NNN07m, NNN08m, na.rm=TRUE)) %>%
  select(-NNN06m, -NNN07m, -NNN08m) %>%
  mutate(lab = "R2") %>%
  rename(proto = R2_proto_value)

r2_pam_qc_df <- r2_qc_df %>%
  select(-proto) %>%
  pivot_wider(names_from = lab,
              values_from = pam,
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(R2_pam_value = pmin(NNN06m, NNN07m, NNN08m, na.rm=TRUE)) %>%
  select(-NNN06m, -NNN07m, -NNN08m) %>%
  mutate(lab = "R2") %>%
  rename(pam = R2_pam_value)

r2_combined_qc_df <- r2_proto_qc_df %>%
  left_join(r2_pam_qc_df)  
## Joining, by = c("target_site", "var", "lab")
########################################################
## JOIN R1 R2 DFS
########################################################
r1_r2_qc_df <- plyr::rbind.fill(r1_combined_qc_df, r2_combined_qc_df)
  
########################################################
## REMOVE UNNECESSARY ENV VARS ##
########################################################
rm(r1_proto_qc_df)
rm(r1_pam_qc_df)
rm(r2_proto_qc_df)
rm(r2_pam_qc_df)
rm(r1_combined_qc_df)
rm(r2_combined_qc_df)

EMX1

make_qc_reads_plot(r1_r2_qc_df, "EMX1")
make_qc_table(r1_r2_qc_df, "EMX1")
target_site var lab proto_pam read_count
EMX1 seqs R1 proto 489072
EMX1 seqs R1 pam 437017
EMX1 seqs_with_primer R1 proto 466871
EMX1 seqs_with_primer R1 pam 418686
EMX1 seqs_with_umi R1 proto 386126
EMX1 seqs_with_umi R1 pam 392840
EMX1 seqs_with_libbarcode R1 proto 343483
EMX1 seqs_with_libbarcode R1 pam 286664
EMX1 seqs_with_target R1 proto 420164
EMX1 seqs_with_target R1 pam 314442
EMX1 seqs_passing_all_filters R1 proto 324280
EMX1 seqs_passing_all_filters R1 pam 271902
EMX1 seqs R2 proto 617312
EMX1 seqs R2 pam 629810
EMX1 seqs_with_primer R2 proto 589315
EMX1 seqs_with_primer R2 pam 606283
EMX1 seqs_with_umi R2 proto 492883
EMX1 seqs_with_umi R2 pam 573867
EMX1 seqs_with_libbarcode R2 proto 435471
EMX1 seqs_with_libbarcode R2 pam 413872
EMX1 seqs_with_target R2 proto 527930
EMX1 seqs_with_target R2 pam 451407
EMX1 seqs_passing_all_filters R2 proto 412352
EMX1 seqs_passing_all_filters R2 pam 394733

FANCF

make_qc_reads_plot(r1_r2_qc_df, "FANCF")
make_qc_table(r1_r2_qc_df, "FANCF")
target_site var lab proto_pam read_count
FANCF seqs R1 proto 241539
FANCF seqs R1 pam 372237
FANCF seqs_with_primer R1 proto 228055
FANCF seqs_with_primer R1 pam 355724
FANCF seqs_with_umi R1 proto 191388
FANCF seqs_with_umi R1 pam 335440
FANCF seqs_with_libbarcode R1 proto 172498
FANCF seqs_with_libbarcode R1 pam 254841
FANCF seqs_with_target R1 proto 210319
FANCF seqs_with_target R1 pam 277710
FANCF seqs_passing_all_filters R1 proto 160805
FANCF seqs_passing_all_filters R1 pam 240830
FANCF seqs R2 proto 361966
FANCF seqs R2 pam 432023
FANCF seqs_with_primer R2 proto 346021
FANCF seqs_with_primer R2 pam 415374
FANCF seqs_with_umi R2 proto 297099
FANCF seqs_with_umi R2 pam 395643
FANCF seqs_with_libbarcode R2 proto 266494
FANCF seqs_with_libbarcode R2 pam 308255
FANCF seqs_with_target R2 proto 319474
FANCF seqs_with_target R2 pam 333736
FANCF seqs_passing_all_filters R2 proto 252081
FANCF seqs_passing_all_filters R2 pam 293366

RNF2

make_qc_reads_plot(r1_r2_qc_df, "RNF2")
make_qc_table(r1_r2_qc_df, "RNF2")
target_site var lab proto_pam read_count
RNF2 seqs R1 proto 180327
RNF2 seqs R1 pam 222795
RNF2 seqs_with_primer R1 proto 170052
RNF2 seqs_with_primer R1 pam 213663
RNF2 seqs_with_umi R1 proto 141928
RNF2 seqs_with_umi R1 pam 199321
RNF2 seqs_with_libbarcode R1 proto 122152
RNF2 seqs_with_libbarcode R1 pam 151119
RNF2 seqs_with_target R1 proto 150423
RNF2 seqs_with_target R1 pam 166572
RNF2 seqs_passing_all_filters R1 proto 113197
RNF2 seqs_passing_all_filters R1 pam 143297
RNF2 seqs R2 proto 312944
RNF2 seqs R2 pam 286467
RNF2 seqs_with_primer R2 proto 299683
RNF2 seqs_with_primer R2 pam 276410
RNF2 seqs_with_umi R2 proto 249380
RNF2 seqs_with_umi R2 pam 261450
RNF2 seqs_with_libbarcode R2 proto 212667
RNF2 seqs_with_libbarcode R2 pam 203807
RNF2 seqs_with_target R2 proto 260487
RNF2 seqs_with_target R2 pam 221851
RNF2 seqs_passing_all_filters R2 proto 201734
RNF2 seqs_passing_all_filters R2 pam 194637

ONE-SEQ SCORE (replicate scatter plots)

All barcodes that were found in both compared datasets are plotted as scatterplots to evaluate the similarity in ONE-Seq score. Triplicates are combined to create one set of R1 and one set of R2 data points. See main R1-R2 section for details on how this accomplished.

########################################################
## CURATE R1 DF
########################################################
r1_combined_oneseq_score_plot_df <- r1_combined_df %>%
  select(barcode, target_site, min_oneseq_score) %>%
  rename(R1 = min_oneseq_score)

########################################################
## CURATE R1 DF
########################################################
r2_oneseq_score_plot_df <- r2_combined_df %>%
  select(barcode, target_site, min_oneseq_score) %>%
  rename(R2 = min_oneseq_score)

########################################################
## JOIN R1 AND R2 DFS
########################################################
r1_r2_oneseq_score_plot_df <- r1_combined_oneseq_score_plot_df %>%
  inner_join(r2_oneseq_score_plot_df) %>%
  ## comparing only those seqs in common between datasets
  drop_na() %>%
  group_by(target_site) %>%
  arrange(target_site)
## Joining, by = c("barcode", "target_site")
########################################################
## SMALL VIEW OF COMBINED DF
########################################################
glimpse(r1_r2_oneseq_score_plot_df)
## Rows: 72,451
## Columns: 4
## Groups: target_site [3]
## $ barcode     <chr> "GCTCAGCATCGATA", "GATGCGTGTGTGAA", "CGTACTTCTGTGCA", "GAA…
## $ target_site <chr> "EMX1", "EMX1", "EMX1", "EMX1", "EMX1", "EMX1", "EMX1", "E…
## $ R1          <dbl> 1.00023867, 1.00000000, 0.91170466, 0.74217450, 0.58488012…
## $ R2          <dbl> 0.96218068, 1.00000000, 0.80655335, 0.69925173, 0.55024229…

R1 vs. R2

## FUNCTION DEFINED IN R1 SECTION
make_oneseq_scatter_plot(r1_r2_oneseq_score_plot_df, "R1", "R2")

## FUNCTION DEFINED IN R1 SECTION
rm(r1_r2_oneseq_score_plot_df)

NORMALIZED COUNTS (replicate scatter plots)

Compare the normalized proto & pam counts that are found in the scores.tsv files. All R1 triplicates and all R2 triplicates are reduced (minimum value) and used to create two groups to compare. See main R1-R2 section for details on how this accomplished.

NOTE only barcodes present in both samples are plotted.

## curate summary df to plot normalized pam + proto counts for each lab & target site
## retain only necessary cols, barcode will be identical between two datasets
## drop any rows that contain NA becaue those were not found in one of the datasets
## comparing only those seqs in common between datasets
#############################################
## DF CURATION
#############################################
####################### R1
## PROTO
r1_normalized_counts_proto_df <- r1_r2_scores_df %>%
  select(barcode, min_norm_proto_count, target_site) %>%
  rename(R1 = min_norm_proto_count)

## PAM
r1_normalized_counts_pam_df <- r1_r2_scores_df %>%
  select(barcode, min_norm_pam_count, target_site) %>%
  rename(R1 = min_norm_pam_count)

#####################  R2
## PROTO
r2_normalized_counts_proto_df <- r1_r2_scores_df %>%
  select(barcode, min_norm_proto_count, target_site) %>%
  rename(R2 = min_norm_proto_count)

## PAM
r2_normalized_counts_pam_df <- r1_r2_scores_df %>%
  select(barcode, min_norm_pam_count, target_site) %>%
  rename(R2 = min_norm_pam_count)

#############################################
## JOIN R1/R2 DFS
#############################################
r1_r2_normalized_counts_pam_df <- r1_normalized_counts_pam_df %>%
  left_join(r2_normalized_counts_pam_df)

r1_r2_normalized_counts_proto_df <- r1_normalized_counts_proto_df  %>%
  left_join(r2_normalized_counts_proto_df)

R1 vs. R2

## FUNCTION DEFINED IN R1 SECTION
make.proto.norm.plot(r1_r2_normalized_counts_proto_df, "R1", "R2")

make.pam.norm.plot(r1_r2_normalized_counts_pam_df, "R1", "R2")

## FUNCTION DEFINED IN R1 SECTION
rm(r1_r2_normalized_counts_proto_df)
rm(r1_r2_normalized_counts_pam_df)

RAW COUNTS (replicate scatter plots)

Compare the raw proto & pam counts that are calculated from the scores.tsv files. All R1 triplicates and all R2 triplicates are reduced (minimum value) and used to create two groups to compare. Triplicates are combined to create one set of R1 and one set of R2 data points. See main R1-R2 section for details on how this accomplished.

NOTE only barcodes present in both samples are plotted.

## curate summary df to plot normalized pam + proto counts for each lab & target site
## retain only necessary cols, barcode will be identical between two datasets
## drop any rows that contain NA becaue those were not found in one of the datasets
## comparing only those seqs in common between datasets
#############################################
## DF CURATION
#############################################
####################### R1
## PROTO
r1_raw_counts_proto_df <- r1_r2_scores_df %>%
  select(barcode, min_proto_raw_read_count, target_site) %>%
  rename(R1 = min_proto_raw_read_count)

## PAM
r1_raw_counts_pam_df <- r1_r2_scores_df %>%
  select(barcode, min_pam_raw_read_count, target_site) %>%
  rename(R1 = min_pam_raw_read_count)

####################### R2
## PROTO
r2_raw_counts_proto_df <- r1_r2_scores_df %>%
  select(barcode, min_proto_raw_read_count, target_site) %>%
  rename(R2 = min_proto_raw_read_count)

## PAM
r2_raw_counts_pam_df <- r1_r2_scores_df %>%
  select(barcode, min_pam_raw_read_count, target_site) %>%
  rename(R2 = min_pam_raw_read_count)

#############################################
## JOIN R1 AND R2 DFS
#############################################
r1_r2_raw_counts_pam_df <- r1_raw_counts_pam_df %>%
  left_join(r2_raw_counts_pam_df)

r1_r2_raw_counts_proto_df <- r1_raw_counts_proto_df  %>%
  left_join(r2_raw_counts_proto_df)

R1 vs. R2

## FUNCTION DEFINED IN R1 SECTION
make.proto.raw.counts.plot(r1_r2_raw_counts_proto_df, "R1", "R2")

make.pam.raw.counts.plot(r1_r2_raw_counts_pam_df, "R1", "R2")

#############################################
## REMOVE OBJETCS
#############################################
rm(r1_r2_raw_counts_proto_df)
rm(r1_r2_raw_counts_pam_df)

VENN BAR: PER TARGET BARCODE COMPARISON

TRIPLICATES COMBINED

All R1 triplicates and all R2 triplicates are reduced (minimum value) and used to create two groups to compare. Triplicates are combined to create one set of R1 and one set of R2 data points. See main R1-R2 section for details on how this accomplished.

cbPalette <- c("#E69F00",
                "#56B4E9", 
                "#000000")

#############################################
## DF CURATION
#############################################
r1_venn_df <- r1_combined_df %>%
  select(barcode, target_site, dataset)

r2_venn_df <- r2_combined_df %>%
  select(barcode, target_site, dataset) 

#############################################
## DF JOIN
#############################################
r1_r2_venn_df <- plyr::rbind.fill(r1_venn_df, r2_venn_df) %>%
  group_by(target_site, barcode) %>%
  summarise(lab_set = str_c(sort(unique(dataset)), collapse = "-"))

#############################################
## PLOT
#############################################
ggplotly(ggplot(data = r1_r2_venn_df) + 
  geom_bar(aes(x = fct_rev(as_factor(target_site)), fill = lab_set), 
           # position = "fill"
           position = "fill") + 
  coord_flip() + 
  theme_bw() + 
  labs(x = "Target Site", 
       y = "Proportion of Sites", 
       fill = "Lab Combinations", 
       title = "R1 vs. R2 Replicate Barcodes") +
    scale_fill_manual(values=cbPalette))
#############################################
## TABLE
#############################################
(r1_r2_venn_table_df <- r1_r2_venn_df  %>% 
  group_by(target_site) %>% 
  count(lab_set)  %>% 
  group_by(target_site) %>% 
  mutate(target_site_total = sum(n)) %>% 
  mutate(pct = round(n/target_site_total * 100, 0),
         count_value = paste0(n, " (", pct, "%)")) %>% 
  select(-n, -pct) %>% 
  pivot_wider(names_from = "lab_set", values_from = "count_value")%>%
  kbl() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", html_font = "Cambria")))
target_site target_site_total R1 R1-R2 R2
EMX1 48558 5950 (12%) 35712 (74%) 6896 (14%)
FANCF 26102 5330 (20%) 16814 (64%) 3958 (15%)
RNF2 25204 1942 (8%) 19925 (79%) 3337 (13%)
########################################################################
# REMOVE UNNECESSARY OBJECTS FROM ENVIRONMENT 
########################################################################
rm(r1_venn_df)
rm(r2_venn_df)

BARCODE CONCORDANCE TABLE

Table that breaks down the number of barcodes that are:

  • Not concordant (present in only 1 replicate)
  • Concordant between 2 replicates
  • Concordant in all 3 replicates

NOTE This is based on at least 1 [calculated] raw read to be considered present in the sample. In the future we may want to increase the read threshold to be considered.

########################################################################
# DF CURATION 
########################################################################
r1_temp_df <- r1_combined_df %>%
  select(barcode, target_site, min_proto_raw_read_count,
         min_pam_raw_read_count, dataset) 
 
r2_temp_df <- r2_combined_df %>%
  select(barcode, target_site, min_proto_raw_read_count,
         min_pam_raw_read_count, dataset) 

########################################################################
# JIN DFS
########################################################################
r1_r2_raw_df <- plyr::rbind.fill(r1_temp_df, r2_temp_df)

########################################################################
# FUNCTION 
########################################################################
make_concordance_table <- function(df, pam_proto){
  if(pam_proto == "proto"){
    (df %>%
      select(barcode, target_site, dataset, min_proto_raw_read_count) %>%
      ## include only those with reads of at least 1
      filter(min_proto_raw_read_count > 0) %>%
      select(-min_proto_raw_read_count) %>%
      group_by(target_site, barcode) %>%
      ## get lab set to figure out how many labs each barcode is present in
      summarise(lab_set = str_c(sort(unique(dataset)), collapse = "-"))  %>%
      mutate(lab_num = str_count(lab_set, "-") + 1) %>%
      group_by(target_site, lab_num) %>%
      tally() %>%
      mutate(lab_num = sub("^", "concordant_in_", lab_num)) %>%
      pivot_wider(names_from = lab_num, 
                  values_from = n, 
                  values_fill = NA) %>%
      kbl() %>%
      kable_styling(bootstrap_options = c("striped", "hover", "condensed", 
                        "responsive", html_font = "Cambria")))
  } else if(pam_proto == "pam"){
    (df %>%
      select(barcode, target_site, dataset, min_pam_raw_read_count) %>%
      ## include only those with reads of at least 1
      filter(min_pam_raw_read_count > 0) %>%
      select(-min_pam_raw_read_count) %>%
      group_by(target_site, barcode) %>%
      ## get lab set to figure out how many labs each barcode is present in
      summarise(lab_set = str_c(sort(unique(dataset)), collapse = "-"))  %>%
      mutate(lab_num = str_count(lab_set, "-") + 1) %>%
      group_by(target_site, lab_num) %>%
      tally() %>%
      mutate(lab_num = sub("^", "concordant_in_", lab_num)) %>%
      pivot_wider(names_from = lab_num, 
                  values_from = n, 
                  values_fill = NA) %>%
      kbl() %>%
      kable_styling(bootstrap_options = c("striped", "hover", "condensed", 
                        "responsive", html_font = "Cambria")))
  } else {
    message("Enter 'pam' or 'proto' for second arg")
  }
}

########################################################################
# REMOVE ENV OBJECTS 
########################################################################
rm(r1_temp_df)
rm(r2_temp_df)

PROTO

make_concordance_table(r1_r2_raw_df, "proto")
## `summarise()` has grouped output by 'target_site'. You can override using the `.groups` argument.
target_site concordant_in_1 concordant_in_2
EMX1 19223 12945
FANCF 10265 6076
RNF2 10509 7600

PAM

make_concordance_table(r1_r2_raw_df, "pam")
## `summarise()` has grouped output by 'target_site'. You can override using the `.groups` argument.
target_site concordant_in_1 concordant_in_2
EMX1 16593 27731
FANCF 10546 12324
RNF2 7820 15748
########################################################################
# REMOVE ENV OBJECTS 
########################################################################
rm(r1_r2_raw_df)

VENN BAR: 100% TRIPLICATE CONCORDANCE

Only barcodes that are present with at least 1 read count across all triplicates are considered. NOTE no count values are taken into consideration.

########################################################################
# COLOR PALETTE TO BE CONSISTENT WITH PREVIOUS SCHEME 
########################################################################
cbPalette2 <- c("#E69F00",
                "#56B4E9", 
                "#000000")
########################################################################
# DF CURATION 
########################################################################
r1 <- r1_scores_df %>%
  select(barcode, target_site, lab, proto_raw_read_count) %>%
  ## include only those with reads of at least 1
  filter(proto_raw_read_count > 0) %>%
  select(-proto_raw_read_count) %>%
  group_by(target_site, barcode) %>%
  ## get lab set to figure out how many labs each barcode is present in
  summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))  %>%
  mutate(lab_num = str_count(lab_set, "-") + 1) %>%
  filter(lab_num == 3) %>%
  select(-lab_set) %>%
  mutate(dataset = "R1")
## `summarise()` has grouped output by 'target_site'. You can override using the `.groups` argument.
r2 <- r2_scores_df %>%
  select(barcode, target_site, lab, proto_raw_read_count) %>%
  ## include only those with reads of at least 1
  filter(proto_raw_read_count > 0) %>%
  select(-proto_raw_read_count) %>%
  group_by(target_site, barcode) %>%
  ## get lab set to figure out how many labs each barcode is present in
  summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))  %>%
  mutate(lab_num = str_count(lab_set, "-") + 1) %>%
  filter(lab_num == 3) %>%
  select(-lab_set) %>%
  mutate(dataset = "R2")
## `summarise()` has grouped output by 'target_site'. You can override using the `.groups` argument.
########################################################################
# JOIN DFS 
########################################################################
r1_r2 <- plyr::rbind.fill(r1, r2) %>%
  group_by(target_site, barcode) %>%
  summarise(lab_set = str_c(sort(unique(dataset)), collapse = "-"))
## `summarise()` has grouped output by 'target_site'. You can override using the `.groups` argument.
########################################################################
# PLOT 
########################################################################
ggplotly(ggplot(data = r1_r2) + 
  geom_bar(aes(x = fct_rev(as_factor(target_site)), fill = lab_set), 
           # position = "fill"
           position = "fill") + 
  coord_flip() + 
  theme_bw() + 
  labs(x = "Target Site", 
       y = "Proportion of Sites", 
       fill = "Lab Combinations", 
       title = "R1 vs. R2 Replicate Barcodes") +
    scale_fill_manual(values=cbPalette2))
########################################################################
# TABLE 
########################################################################
(r1_r2_venn_table_df <- r1_r2  %>% 
  group_by(target_site) %>% 
  count(lab_set)  %>% 
  group_by(target_site) %>% 
  mutate(target_site_total = sum(n)) %>% 
  mutate(pct = round(n/target_site_total * 100, 0),
         count_value = paste0(n, " (", pct, "%)")) %>% 
  select(-n, -pct) %>% 
  pivot_wider(names_from = "lab_set", values_from = "count_value")%>%
  kbl() %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", html_font = "Cambria")))
target_site target_site_total R1 R1-R2 R2
EMX1 3927 1316 (34%) 1668 (42%) 943 (24%)
FANCF 1716 246 (14%) 929 (54%) 541 (32%)
RNF2 1632 396 (24%) 572 (35%) 664 (41%)
########################################################################
# REMOVE ENV OBJECTS 
########################################################################
rm(r1)
rm(r2)
rm(r1_r2)
rm(r1_r2_venn_table_df)

DENSITY PLOTS

TRIPLICATES COMBINED

Similar to the venn bar plots, visualize barcode overlap between samples with raw read counts plotted. Triplicates are combined to create one set of R1 and one set of R2 data points. See main R1-R2 section for details on how this accomplished.

########################################################################
# CURATE R1 - R2 DFS 
########################################################################
r1_raw_reads_df <- r1_combined_df %>%
  select(barcode, target_site, min_proto_raw_read_count,
         min_pam_raw_read_count, dataset) %>%
  rename(lab = dataset, 
         proto_raw_read_count = min_proto_raw_read_count, 
         pam_raw_read_count = min_pam_raw_read_count)

r2_raw_reads_df <- r2_combined_df %>%
  select(barcode, target_site, min_pam_raw_read_count,
         min_proto_raw_read_count, dataset) %>%
  rename(lab = dataset, 
         proto_raw_read_count = min_proto_raw_read_count, 
         pam_raw_read_count = min_pam_raw_read_count)

########################################################################
# JOIN DFS 
########################################################################
r1_r2_raw_counts_df <- plyr::rbind.fill(r1_raw_reads_df, r2_raw_reads_df)

########################################################################
# REMOVE ENV OBJECTS 
########################################################################
rm(r1_raw_reads_df)
rm(r2_raw_reads_df)

PROTO

## FUNCTIONS DEFINED IN R1 SECTION
lab_list <- c("R1", "R2")
ggplotly(make.proto.density.plots(r1_r2_raw_counts_df, lab_list))

PAM

## FUNCTIONS DEFINED IN R1 SECTION
lab_list <-  c("R1", "R2")
ggplotly(make.pam.density.plots(r1_r2_raw_counts_df, lab_list))

DENSITY PLOT : 100% TRIPLICATE CONCORDANCE

Only barcodes that are present with at least 1 read count across all triplicates are considered.

##############################################
## GET BARCODES OF 100% CONCORDANT TRIPLICATE
##############################################

r1 <- r1_scores_df %>%
  select(barcode, target_site, lab, proto_raw_read_count) %>%
  ## include only those with reads of at least 1
  filter(proto_raw_read_count > 0) %>%
  group_by(target_site, barcode) %>%
  ## get lab set to figure out how many labs each barcode is present in
  summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))  %>%
  mutate(lab_num = str_count(lab_set, "-") + 1) %>%
  filter(lab_num == 3) %>%
  select(-lab_set) %>%
  mutate(dataset = "R1")
      
 
r2 <- r2_scores_df %>%
  select(barcode, target_site, lab, proto_raw_read_count) %>%
  ## include only those with reads of at least 1
  filter(proto_raw_read_count > 0) %>%
  select(-proto_raw_read_count) %>%
  group_by(target_site, barcode) %>%
  ## get lab set to figure out how many labs each barcode is present in
  summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))  %>%
  mutate(lab_num = str_count(lab_set, "-") + 1) %>%
  filter(lab_num == 3) %>%
  select(-lab_set) %>%
  mutate(dataset = "R2")
 
##############################################
## GENERATE MIN COUNT DFS
##############################################
##################### PROTO #################
r1_concordant_proto_counts <- r1_scores_df %>%
  select(barcode, target_site, lab, proto_raw_read_count) %>%
  pivot_wider(names_from = lab, 
              values_from = proto_raw_read_count, 
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(proto_raw_read_count = pmin(NNN03m, 
                                    NNN04m,
                                    NNN05m,
                                    na.rm=TRUE)) %>%
  select(-NNN03m, -NNN04m, -NNN05m) %>%
  filter(barcode %in% r1$barcode) %>%
  mutate(lab = "R1")

r2_concordant_proto_counts <- r2_scores_df %>%
  select(barcode, target_site, lab, proto_raw_read_count) %>%
  pivot_wider(names_from = lab, 
              values_from = proto_raw_read_count, 
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(proto_raw_read_count = pmin(NNN06m, 
                                    NNN07m,
                                    NNN08m,
                                    na.rm=TRUE)) %>%
  select(-NNN06m, -NNN07m, -NNN08m) %>%
  filter(barcode %in% r2$barcode) %>%
  mutate(lab = "R2")

##################### PAM #################
r1_concordant_pam_counts <- r1_scores_df %>%
  select(barcode, target_site, lab, pam_raw_read_count) %>%
  pivot_wider(names_from = lab, 
              values_from = pam_raw_read_count, 
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(pam_raw_read_count = pmin(NNN03m, 
                                  NNN04m,
                                  NNN05m,
                                  na.rm=TRUE)) %>%
  select(-NNN03m, -NNN04m, -NNN05m) %>%
  filter(barcode %in% r1$barcode) %>%
  mutate(lab = "R1")

r2_concordant_pam_counts <- r2_scores_df %>%
  select(barcode, target_site, lab, pam_raw_read_count) %>%
  pivot_wider(names_from = lab, 
              values_from = pam_raw_read_count, 
              values_fill = NA) %>%
  ## make all 0's NA so it isn't considered in the pmin function
  mutate_if(is.numeric, ~replace(., . == 0, NA)) %>%
  mutate(pam_raw_read_count = pmin(NNN06m, 
                                  NNN07m,
                                  NNN08m,
                                  na.rm=TRUE)) %>%
  select(-NNN06m, -NNN07m, -NNN08m) %>%
  filter(barcode %in% r2$barcode) %>%
  mutate(lab = "R2")

##############################################
## GET 1 R1 R2 DF 
##############################################
r1_r2_concordant_proto_counts <- plyr::rbind.fill(r1_concordant_proto_counts, 
                                                  r2_concordant_proto_counts)

r1_r2_concordant_pam_counts <- plyr::rbind.fill(r1_concordant_pam_counts, 
                                                r2_concordant_pam_counts)

########################################################################
# REMOVE ENV OBJECTS 
########################################################################
rm(r1)
rm(r2)
rm(r1_concordant_proto_counts)
rm(r2_concordant_proto_counts)
rm(r1_concordant_pam_counts)
rm(r2_concordant_pam_counts)

PROTO

lab_list <- c("R1", "R2")
ggplotly(make.proto.density.plots(r1_r2_concordant_proto_counts, lab_list))

PAM

lab_list <-  c("R1", "R2")
ggplotly(make.pam.density.plots(r1_r2_concordant_pam_counts, lab_list))

DENSITY PLOT: COMBINED PROTO & PAM

##############################################
## DF CURATION 
##############################################
r1_r2_combined_proto_pam_df <- r1_r2_raw_counts_df %>%
  mutate(min_proto_pam_raw_count = pmin(proto_raw_read_count,
                                        pam_raw_read_count, 
                                        na.rm=TRUE)) %>%
  select(-proto_raw_read_count, -pam_raw_read_count)

##############################################
## FUNCTION 
##############################################
make.combined.density.plots <- function(df, lablist){
  labs <- c(lablist)
  
  density_plot_labset_df <- df %>%
  filter(lab %in% labs) %>%
  select(barcode, lab, target_site) %>%
  group_by(target_site, barcode) %>%
  summarise(lab_set = str_c(sort(unique(lab)), collapse = "-"))
  
  proto_density_plot_df <- df %>%
  filter(lab %in% labs) %>%
  select(barcode, lab, target_site, min_proto_pam_raw_count) %>%
  right_join(density_plot_labset_df) %>%
  count(target_site, lab_set, min_proto_pam_raw_count, name = "Occurrence")
  
  options(scipen = 999)
  
  plot <- ggplot(proto_density_plot_df) + 
    geom_histogram(aes(x = min_proto_pam_raw_count, fill = lab_set), 
                   position = "fill") + 
    scale_x_log10() + 
    facet_wrap(~target_site, scales = "free_y", ncol = 1) +
    theme_bw() +
    scale_fill_brewer(type = "qual") +
    labs(x = "Read Count", y = "Proportion of Sites", fill = "Lab Combinations", main = "Combined Proto & PAM Raw Read Counts") +
    scale_fill_manual(values=cbPalette)
  
  return(plot)
}

##############################################
## REMOVE ENV OBJECTS 
##############################################
lab_list <- c("R1", "R2")
make.combined.density.plots(r1_r2_combined_proto_pam_df, lab_list)